CC <- read_csv("../data/crime_and_communities_data.csv")
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
CC <- as.data.frame(CC)

Dataset exploration

After the dataset has been imported, we want to preview it to get an idea of what we will be working with.

head(CC, 10)
##    population householdsize racepctblack racePctWhite racePctAsian racePctHisp
## 1       11980          3.10         1.37        91.78         6.50        1.88
## 2       23123          2.82         0.80        95.57         3.44        0.85
## 3       29344          2.43         0.74        94.33         3.43        2.35
## 4       16656          2.40         1.70        97.35         0.50        0.70
## 5      140494          2.45         2.51        95.65         0.90        0.95
## 6       28700          2.60         1.60        96.57         1.47        1.10
## 7       59459          2.45        14.20        84.87         0.40        0.63
## 8       74111          2.46         0.35        97.11         1.25        0.73
## 9      103590          2.62        23.14        67.60         0.92       16.35
## 10      31601          2.54        12.63        83.22         0.77        4.39
##    agePct12t21 agePct12t29 agePct16t24 agePct65up numbUrban pctUrban medIncome
## 1        12.47       21.44       10.93      11.33     11980      100     75122
## 2        11.01       21.30       10.48      17.18     23123      100     47917
## 3        11.36       25.88       11.01      10.28     29344      100     35669
## 4        12.55       25.20       12.19      17.57         0        0     20580
## 5        18.09       32.89       20.04      13.26    140494      100     21577
## 6        11.17       27.41       12.76      14.42     28700      100     42805
## 7        15.31       27.93       14.78      14.60     59449      100     23221
## 8        16.64       35.16       20.33       8.58     74115      100     25326
## 9        19.88       34.55       21.62      13.12    103590      100     17852
## 10       15.73       28.57       15.16      14.26     31596      100     24763
##    pctWWage pctWFarmSelf pctWInvInc pctWSocSec pctWPubAsst pctWRetire medFamInc
## 1     89.24         1.55      70.20      23.62        1.03      18.39     79584
## 2     78.99         1.11      64.11      35.50        2.75      22.85     55323
## 3     82.00         1.15      55.73      22.25        2.94      14.56     42112
## 4     68.15         0.24      38.95      39.48       11.71      18.33     26501
## 5     75.78         1.00      41.15      29.31        7.12      14.09     27705
## 6     79.47         0.39      47.70      30.23        5.41      17.23     50394
## 7     71.60         0.67      35.74      32.58        8.81      22.59     28901
## 8     83.69         2.93      47.11      19.30        4.21      10.31     34269
## 9     74.20         0.86      30.98      29.09        9.06      13.99     24058
## 10    73.92         1.54      37.36      32.68        7.02      15.20     29509
##    perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap OtherPerCap
## 1      29711       30233       13600         5725       27101        5115
## 2      20148       20191       18137            0       20074        5250
## 3      16946       17103       16644        21606       15528        5954
## 4      10810       10909        9984         4941        3541        2451
## 5      11878       12029        7382        10264       10753        7192
## 6      18193       18276       17342        21482       12639       21852
## 7      12161       12599        9820         6634        8802        7428
## 8      13554       13727        8852         5344        8011        5332
## 9      10195       12126        5715        11313        5770        7320
## 10     12929       14051        7496         9126        9107        5267
##    HispPerCap NumUnderPov PctPopUnderPov PctLess9thGrade PctNotHSGrad
## 1       22838         227           1.96            5.81         9.90
## 2       12222         885           3.98            5.61        13.72
## 3        8405        1389           4.75            2.80         9.09
## 4        4391        2831          17.23           11.05        33.68
## 5        8104       23223          17.78            8.76        23.03
## 6       22594        1126           4.01            4.49        13.89
## 7        6187       10320          17.98           10.09        28.67
## 8        5174        9603          13.68            5.52        11.27
## 9        6984       27767          28.68           13.01        31.62
## 10       4542        4698          15.61            9.07        24.86
##    PctBSorMore PctUnemployed PctEmploy PctEmplManu PctEmplProfServ PctOccupManu
## 1        48.18          2.70     64.55       14.65           28.82         5.49
## 2        29.89          2.43     61.96       12.26           29.28         6.39
## 3        30.13          4.01     69.80       15.95           21.52         8.79
## 4        10.81          9.86     54.74       31.22           27.43        26.76
## 5        20.66          5.72     59.02       14.31           26.83        14.72
## 6        27.01          4.85     65.42       14.02           27.17         8.50
## 7        12.00          8.19     56.59       27.00           21.54        21.92
## 8        30.24          4.18     68.51        6.89           31.55        11.37
## 9        17.02          8.39     51.37       15.73           29.06        16.43
## 10       19.23          7.19     57.76       25.33           27.59        15.74
##    PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv TotalPctDiv
## 1             50.73           3.67          26.38         5.22        4.47
## 2             37.64           4.23          27.99         6.45        5.42
## 3             32.48          10.10          25.78        14.76       12.55
## 4             22.71          10.98          28.15        14.47       12.91
## 5             23.42          11.40          33.32        14.46       13.04
## 6             32.78           5.97          36.05         9.06        7.64
## 7             18.02          13.28          28.34        16.33       14.94
## 8             29.43           7.29          40.87         9.94        8.64
## 9             24.30          11.07          38.49        14.66       12.97
## 10            27.28          11.48          27.60        15.26       13.53
##    PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par PctTeen2Par
## 1        3.22      91.43       90.17            95.78       95.81
## 2        3.11      86.91       85.33            96.82       86.46
## 3        2.95      78.54       78.85            92.37       75.72
## 4        2.98      64.02       62.36            65.38       67.43
## 5        2.89      71.94       69.79            79.76       75.33
## 6        3.14      79.53       79.76            92.05       77.12
## 7        2.95      62.56       58.70            69.89       62.76
## 8        3.00      79.35       79.70            86.60       80.70
## 9        3.11      61.65       54.56            68.85       61.69
## 10       2.99      68.41       64.64            75.18       70.94
##    PctWorkMomYoungKids PctWorkMom NumKidsBornNeverMar PctKidsBornNeverMar
## 1                44.56      58.88                  31                0.36
## 2                51.14      62.43                  43                0.24
## 3                66.08      74.19                 164                0.88
## 4                59.59      70.27                 561                3.84
## 5                62.96      70.52                1511                1.58
## 6                65.16      72.81                 263                1.18
## 7                63.08      72.44                2368                4.66
## 8                74.32      78.51                 751                1.64
## 9                60.80      69.23                3537                4.71
## 10               67.43      72.96                 603                2.47
##    NumImmig PctImmigRecent PctImmigRec5 PctImmigRec8 PctImmigRec10
## 1      1277           8.69        13.00        20.99         30.93
## 2      1920           5.21         8.65        13.33         22.50
## 3      1468          16.42        23.98        32.08         35.63
## 4       339          13.86        13.86        15.34         15.34
## 5      2091          21.33        30.56        38.02         45.48
## 6      2637          11.38        16.27        23.93         27.76
## 7       517          13.15        22.82        28.24         33.08
## 8      1474          23.68        33.58        46.68         53.93
## 9      4793          15.54        23.08        35.32         49.82
## 10      938          17.91        35.39        56.08         65.46
##    PctRecentImmig PctRecImmig5 PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly
## 1            0.93         1.39         2.24          3.30            85.68
## 2            0.43         0.72         1.11          1.87            87.79
## 3            0.82         1.20         1.61          1.78            93.11
## 4            0.28         0.28         0.31          0.31            94.98
## 5            0.32         0.45         0.57          0.68            96.87
## 6            1.05         1.49         2.20          2.55            89.98
## 7            0.11         0.20         0.25          0.29            97.43
## 8            0.47         0.67         0.93          1.07            95.21
## 9            0.72         1.07         1.63          2.31            85.72
## 10           0.53         1.05         1.66          1.94            94.85
##    PctNotSpeakEnglWell PctLargHouseFam PctLargHouseOccup PersPerOccupHous
## 1                 1.37            4.81              4.17             2.99
## 2                 1.81            4.25              3.34             2.70
## 3                 1.14            2.97              2.05             2.42
## 4                 0.56            3.93              2.56             2.37
## 5                 0.60            3.08              1.92             2.28
## 6                 0.60            5.08              3.46             2.55
## 7                 0.28            3.85              2.55             2.36
## 8                 0.43            2.59              1.54             2.32
## 9                 2.51            6.70              4.10             2.45
## 10                0.81            3.66              2.51             2.42
##    PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup PctPersDenseHous
## 1               3.00               2.84           91.46             0.39
## 2               2.83               1.96           89.03             1.01
## 3               2.69               2.06           64.18             2.03
## 4               2.51               2.20           58.18             1.21
## 5               2.37               2.16           57.81             2.11
## 6               2.89               2.09           64.62             1.47
## 7               2.42               2.27           65.29             1.90
## 8               2.77               1.91           57.42             1.67
## 9               2.47               2.44           46.82             6.14
## 10              2.50               2.31           59.76             3.41
##    PctHousLess3BR MedNumBR HousVacant PctHousOccup PctHousOwnOcc
## 1           11.06        3         64        98.37         91.01
## 2           23.60        3        240        97.15         84.88
## 3           47.46        3        544        95.68         57.79
## 4           45.66        3        669        91.19         54.89
## 5           53.19        2       5119        91.81         55.50
## 6           47.35        3        566        95.11         56.96
## 7           56.30        2       2051        92.22         63.82
## 8           59.32        2       1562        95.07         48.10
## 9           59.96        2       5606        87.57         46.51
## 10          52.11        2       1807        87.33         57.83
##    PctVacantBoarded PctVacMore6Mos MedYrHousBuilt PctHousNoPhone PctWOFullPlumb
## 1              3.12          37.50           1959           0.00           0.28
## 2              0.00          18.33           1958           0.31           0.14
## 3              0.92           7.54           1976           1.55           0.12
## 4              2.54          57.85           1939           7.00           0.87
## 5              2.09          26.22           1966           6.13           0.31
## 6              1.41          34.45           1956           0.69           0.28
## 7              6.39          56.36           1954           8.42           0.49
## 8              0.45          25.61           1971           2.66           0.19
## 9              5.64          37.57           1960          11.74           0.33
## 10             2.77          42.34           1965           7.89           0.30
##    OwnOccLowQuart OwnOccMedVal OwnOccHiQuart OwnOccQrange RentLowQ RentMedian
## 1          215900       262600        326900       111000      685       1001
## 2          136300       164200        199900        63600      467        560
## 3           74700        90400        112000        37300      370        428
## 4           36400        49600         66500        30100      195        250
## 5           37700        53900         73100        35400      215        280
## 6          155100       179000        215500        60400      463        669
## 7           26300        37000         52400        26100      186        253
## 8           54500        70300         93700        39200      241        321
## 9           28600        43100         67400        38800      192        281
## 10          32200        49800         73600        41400      234        305
##    RentHighQ RentQrange MedRent MedRentPctHousInc MedOwnCostPctInc
## 1       1001        316    1001              23.8             21.1
## 2        672        205     627              27.6             20.7
## 3        520        150     484              24.1             21.7
## 4        309        114     333              28.7             20.6
## 5        349        134     340              26.4             17.3
## 6        824        361     736              24.4             20.8
## 7        325        139     338              26.3             15.1
## 8        387        146     355              25.2             20.7
## 9        369        177     353              29.6             19.4
## 10       376        142     380              23.8             17.1
##    MedOwnCostPctIncNoMtg NumInShelters NumStreet PctForeignBorn
## 1                   14.0            11         0          10.66
## 2                   12.5             0         0           8.30
## 3                   11.6            16         0           5.00
## 4                   14.5             0         0           2.04
## 5                   11.7           327         4           1.49
## 6                   12.5             0         0           9.19
## 7                   12.2            21         0           0.87
## 8                   12.8           125        15           1.99
## 9                   13.0            43         4           4.63
## 10                  12.9             1         0           2.97
##    PctBornSameState PctSameHouse85 PctSameCity85 PctSameState85 LemasSwornFT
## 1             53.72          65.29         78.09          89.14           NA
## 2             77.17          71.27         90.22          96.12           NA
## 3             44.77          36.60         61.26          82.85           NA
## 4             88.71          56.70         90.17          96.24           NA
## 5             64.35          42.29         70.61          85.66           NA
## 6             77.30          63.45         82.23          93.53           NA
## 7             73.70          54.85         85.55          91.51           NA
## 8             58.82          40.72         67.97          81.39           NA
## 9             75.59          42.33         74.05          92.12          198
## 10            65.73          44.95         74.82          88.66           NA
##    LemasSwFTPerPop LemasSwFTFieldOps LemasSwFTFieldPerPop LemasTotalReq
## 1               NA                NA                   NA            NA
## 2               NA                NA                   NA            NA
## 3               NA                NA                   NA            NA
## 4               NA                NA                   NA            NA
## 5               NA                NA                   NA            NA
## 6               NA                NA                   NA            NA
## 7               NA                NA                   NA            NA
## 8               NA                NA                   NA            NA
## 9           183.53               187               173.33         73432
## 10              NA                NA                   NA            NA
##    LemasTotReqPerPop PolicReqPerOffic PolicPerPop RacialMatchCommPol
## 1                 NA               NA          NA                 NA
## 2                 NA               NA          NA                 NA
## 3                 NA               NA          NA                 NA
## 4                 NA               NA          NA                 NA
## 5                 NA               NA          NA                 NA
## 6                 NA               NA          NA                 NA
## 7                 NA               NA          NA                 NA
## 8                 NA               NA          NA                 NA
## 9            68065.1            370.9       183.5              89.32
## 10                NA               NA          NA                 NA
##    PctPolicWhite PctPolicBlack PctPolicHisp PctPolicAsian PctPolicMinor
## 1             NA            NA           NA            NA            NA
## 2             NA            NA           NA            NA            NA
## 3             NA            NA           NA            NA            NA
## 4             NA            NA           NA            NA            NA
## 5             NA            NA           NA            NA            NA
## 6             NA            NA           NA            NA            NA
## 7             NA            NA           NA            NA            NA
## 8             NA            NA           NA            NA            NA
## 9          78.28         11.11        10.61             0         21.72
## 10            NA            NA           NA            NA            NA
##    OfficAssgnDrugUnits NumKindsDrugsSeiz PolicAveOTWorked LandArea PopDens
## 1                   NA                NA               NA      6.5  1845.9
## 2                   NA                NA               NA     10.6  2186.7
## 3                   NA                NA               NA     10.6  2780.9
## 4                   NA                NA               NA      5.2  3217.7
## 5                   NA                NA               NA     70.4  1995.7
## 6                   NA                NA               NA     10.9  2643.5
## 7                   NA                NA               NA     39.2  1515.3
## 8                   NA                NA               NA     30.9  2399.3
## 9                   13                12             60.2     78.5  1319.3
## 10                  NA                NA               NA     38.7   816.1
##    PctUsePubTrans PolicCars PolicOperBudg LemasPctPolicOnPatr
## 1            9.63        NA            NA                  NA
## 2            3.84        NA            NA                  NA
## 3            4.37        NA            NA                  NA
## 4            3.31        NA            NA                  NA
## 5            0.97        NA            NA                  NA
## 6            9.62        NA            NA                  NA
## 7            0.70        NA            NA                  NA
## 8            1.41        NA            NA                  NA
## 9            0.76       100       9315474               94.44
## 10           0.00        NA            NA                  NA
##    LemasGangUnitDeploy LemasPctOfficDrugUn PolicBudgPerPop ViolentCrimesPerPop
## 1                   NA                0.00              NA               41.02
## 2                   NA                0.00              NA              127.56
## 3                   NA                0.00              NA              218.59
## 4                   NA                0.00              NA              306.64
## 5                   NA                0.00              NA              442.95
## 6                   NA                0.00              NA              226.63
## 7                   NA                0.00              NA              439.73
## 8                   NA                0.00              NA              115.31
## 9                   10                6.57         86346.3             1544.24
## 10                  NA                0.00              NA              722.02

Next, we will learn more about our dataset’s nuances. Some of the attributes we would like to pay attention to include the dimension, the number of categorical versus numerical variables, and how many missing values there are. Below is a high level overview which includes the attributes of interest as well as some other useful to know information about our data, such as memory. This can be done by using the glimpse function from the dplyr package and the introduce function from the DataExploration package.

#This is like a transposed version of print: columns run down the page, and data runs across. This makes it possible to see every column in a data frame, essentially trying to show as much data as possible
glimpse(CC)
## Observations: 1,994
## Variables: 125
## $ population            <dbl> 11980, 23123, 29344, 16656, 140494, 28700, 5945…
## $ householdsize         <dbl> 3.10, 2.82, 2.43, 2.40, 2.45, 2.60, 2.45, 2.46,…
## $ racepctblack          <dbl> 1.37, 0.80, 0.74, 1.70, 2.51, 1.60, 14.20, 0.35…
## $ racePctWhite          <dbl> 91.78, 95.57, 94.33, 97.35, 95.65, 96.57, 84.87…
## $ racePctAsian          <dbl> 6.50, 3.44, 3.43, 0.50, 0.90, 1.47, 0.40, 1.25,…
## $ racePctHisp           <dbl> 1.88, 0.85, 2.35, 0.70, 0.95, 1.10, 0.63, 0.73,…
## $ agePct12t21           <dbl> 12.47, 11.01, 11.36, 12.55, 18.09, 11.17, 15.31…
## $ agePct12t29           <dbl> 21.44, 21.30, 25.88, 25.20, 32.89, 27.41, 27.93…
## $ agePct16t24           <dbl> 10.93, 10.48, 11.01, 12.19, 20.04, 12.76, 14.78…
## $ agePct65up            <dbl> 11.33, 17.18, 10.28, 17.57, 13.26, 14.42, 14.60…
## $ numbUrban             <dbl> 11980, 23123, 29344, 0, 140494, 28700, 59449, 7…
## $ pctUrban              <dbl> 100.00, 100.00, 100.00, 0.00, 100.00, 100.00, 1…
## $ medIncome             <dbl> 75122, 47917, 35669, 20580, 21577, 42805, 23221…
## $ pctWWage              <dbl> 89.24, 78.99, 82.00, 68.15, 75.78, 79.47, 71.60…
## $ pctWFarmSelf          <dbl> 1.55, 1.11, 1.15, 0.24, 1.00, 0.39, 0.67, 2.93,…
## $ pctWInvInc            <dbl> 70.20, 64.11, 55.73, 38.95, 41.15, 47.70, 35.74…
## $ pctWSocSec            <dbl> 23.62, 35.50, 22.25, 39.48, 29.31, 30.23, 32.58…
## $ pctWPubAsst           <dbl> 1.03, 2.75, 2.94, 11.71, 7.12, 5.41, 8.81, 4.21…
## $ pctWRetire            <dbl> 18.39, 22.85, 14.56, 18.33, 14.09, 17.23, 22.59…
## $ medFamInc             <dbl> 79584, 55323, 42112, 26501, 27705, 50394, 28901…
## $ perCapInc             <dbl> 29711, 20148, 16946, 10810, 11878, 18193, 12161…
## $ whitePerCap           <dbl> 30233, 20191, 17103, 10909, 12029, 18276, 12599…
## $ blackPerCap           <dbl> 13600, 18137, 16644, 9984, 7382, 17342, 9820, 8…
## $ indianPerCap          <dbl> 5725, 0, 21606, 4941, 10264, 21482, 6634, 5344,…
## $ AsianPerCap           <dbl> 27101, 20074, 15528, 3541, 10753, 12639, 8802, …
## $ OtherPerCap           <dbl> 5115, 5250, 5954, 2451, 7192, 21852, 7428, 5332…
## $ HispPerCap            <dbl> 22838, 12222, 8405, 4391, 8104, 22594, 6187, 51…
## $ NumUnderPov           <dbl> 227, 885, 1389, 2831, 23223, 1126, 10320, 9603,…
## $ PctPopUnderPov        <dbl> 1.96, 3.98, 4.75, 17.23, 17.78, 4.01, 17.98, 13…
## $ PctLess9thGrade       <dbl> 5.81, 5.61, 2.80, 11.05, 8.76, 4.49, 10.09, 5.5…
## $ PctNotHSGrad          <dbl> 9.90, 13.72, 9.09, 33.68, 23.03, 13.89, 28.67, …
## $ PctBSorMore           <dbl> 48.18, 29.89, 30.13, 10.81, 20.66, 27.01, 12.00…
## $ PctUnemployed         <dbl> 2.70, 2.43, 4.01, 9.86, 5.72, 4.85, 8.19, 4.18,…
## $ PctEmploy             <dbl> 64.55, 61.96, 69.80, 54.74, 59.02, 65.42, 56.59…
## $ PctEmplManu           <dbl> 14.65, 12.26, 15.95, 31.22, 14.31, 14.02, 27.00…
## $ PctEmplProfServ       <dbl> 28.82, 29.28, 21.52, 27.43, 26.83, 27.17, 21.54…
## $ PctOccupManu          <dbl> 5.49, 6.39, 8.79, 26.76, 14.72, 8.50, 21.92, 11…
## $ PctOccupMgmtProf      <dbl> 50.73, 37.64, 32.48, 22.71, 23.42, 32.78, 18.02…
## $ MalePctDivorce        <dbl> 3.67, 4.23, 10.10, 10.98, 11.40, 5.97, 13.28, 7…
## $ MalePctNevMarr        <dbl> 26.38, 27.99, 25.78, 28.15, 33.32, 36.05, 28.34…
## $ FemalePctDiv          <dbl> 5.22, 6.45, 14.76, 14.47, 14.46, 9.06, 16.33, 9…
## $ TotalPctDiv           <dbl> 4.47, 5.42, 12.55, 12.91, 13.04, 7.64, 14.94, 8…
## $ PersPerFam            <dbl> 3.22, 3.11, 2.95, 2.98, 2.89, 3.14, 2.95, 3.00,…
## $ PctFam2Par            <dbl> 91.43, 86.91, 78.54, 64.02, 71.94, 79.53, 62.56…
## $ PctKids2Par           <dbl> 90.17, 85.33, 78.85, 62.36, 69.79, 79.76, 58.70…
## $ PctYoungKids2Par      <dbl> 95.78, 96.82, 92.37, 65.38, 79.76, 92.05, 69.89…
## $ PctTeen2Par           <dbl> 95.81, 86.46, 75.72, 67.43, 75.33, 77.12, 62.76…
## $ PctWorkMomYoungKids   <dbl> 44.56, 51.14, 66.08, 59.59, 62.96, 65.16, 63.08…
## $ PctWorkMom            <dbl> 58.88, 62.43, 74.19, 70.27, 70.52, 72.81, 72.44…
## $ NumKidsBornNeverMar   <dbl> 31, 43, 164, 561, 1511, 263, 2368, 751, 3537, 6…
## $ PctKidsBornNeverMar   <dbl> 0.36, 0.24, 0.88, 3.84, 1.58, 1.18, 4.66, 1.64,…
## $ NumImmig              <dbl> 1277, 1920, 1468, 339, 2091, 2637, 517, 1474, 4…
## $ PctImmigRecent        <dbl> 8.69, 5.21, 16.42, 13.86, 21.33, 11.38, 13.15, …
## $ PctImmigRec5          <dbl> 13.00, 8.65, 23.98, 13.86, 30.56, 16.27, 22.82,…
## $ PctImmigRec8          <dbl> 20.99, 13.33, 32.08, 15.34, 38.02, 23.93, 28.24…
## $ PctImmigRec10         <dbl> 30.93, 22.50, 35.63, 15.34, 45.48, 27.76, 33.08…
## $ PctRecentImmig        <dbl> 0.93, 0.43, 0.82, 0.28, 0.32, 1.05, 0.11, 0.47,…
## $ PctRecImmig5          <dbl> 1.39, 0.72, 1.20, 0.28, 0.45, 1.49, 0.20, 0.67,…
## $ PctRecImmig8          <dbl> 2.24, 1.11, 1.61, 0.31, 0.57, 2.20, 0.25, 0.93,…
## $ PctRecImmig10         <dbl> 3.30, 1.87, 1.78, 0.31, 0.68, 2.55, 0.29, 1.07,…
## $ PctSpeakEnglOnly      <dbl> 85.68, 87.79, 93.11, 94.98, 96.87, 89.98, 97.43…
## $ PctNotSpeakEnglWell   <dbl> 1.37, 1.81, 1.14, 0.56, 0.60, 0.60, 0.28, 0.43,…
## $ PctLargHouseFam       <dbl> 4.81, 4.25, 2.97, 3.93, 3.08, 5.08, 3.85, 2.59,…
## $ PctLargHouseOccup     <dbl> 4.17, 3.34, 2.05, 2.56, 1.92, 3.46, 2.55, 1.54,…
## $ PersPerOccupHous      <dbl> 2.99, 2.70, 2.42, 2.37, 2.28, 2.55, 2.36, 2.32,…
## $ PersPerOwnOccHous     <dbl> 3.00, 2.83, 2.69, 2.51, 2.37, 2.89, 2.42, 2.77,…
## $ PersPerRentOccHous    <dbl> 2.84, 1.96, 2.06, 2.20, 2.16, 2.09, 2.27, 1.91,…
## $ PctPersOwnOccup       <dbl> 91.46, 89.03, 64.18, 58.18, 57.81, 64.62, 65.29…
## $ PctPersDenseHous      <dbl> 0.39, 1.01, 2.03, 1.21, 2.11, 1.47, 1.90, 1.67,…
## $ PctHousLess3BR        <dbl> 11.06, 23.60, 47.46, 45.66, 53.19, 47.35, 56.30…
## $ MedNumBR              <dbl> 3, 3, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 3, 3, 2, 3,…
## $ HousVacant            <dbl> 64, 240, 544, 669, 5119, 566, 2051, 1562, 5606,…
## $ PctHousOccup          <dbl> 98.37, 97.15, 95.68, 91.19, 91.81, 95.11, 92.22…
## $ PctHousOwnOcc         <dbl> 91.01, 84.88, 57.79, 54.89, 55.50, 56.96, 63.82…
## $ PctVacantBoarded      <dbl> 3.12, 0.00, 0.92, 2.54, 2.09, 1.41, 6.39, 0.45,…
## $ PctVacMore6Mos        <dbl> 37.50, 18.33, 7.54, 57.85, 26.22, 34.45, 56.36,…
## $ MedYrHousBuilt        <dbl> 1959, 1958, 1976, 1939, 1966, 1956, 1954, 1971,…
## $ PctHousNoPhone        <dbl> 0.00, 0.31, 1.55, 7.00, 6.13, 0.69, 8.42, 2.66,…
## $ PctWOFullPlumb        <dbl> 0.28, 0.14, 0.12, 0.87, 0.31, 0.28, 0.49, 0.19,…
## $ OwnOccLowQuart        <dbl> 215900, 136300, 74700, 36400, 37700, 155100, 26…
## $ OwnOccMedVal          <dbl> 262600, 164200, 90400, 49600, 53900, 179000, 37…
## $ OwnOccHiQuart         <dbl> 326900, 199900, 112000, 66500, 73100, 215500, 5…
## $ OwnOccQrange          <dbl> 111000, 63600, 37300, 30100, 35400, 60400, 2610…
## $ RentLowQ              <dbl> 685, 467, 370, 195, 215, 463, 186, 241, 192, 23…
## $ RentMedian            <dbl> 1001, 560, 428, 250, 280, 669, 253, 321, 281, 3…
## $ RentHighQ             <dbl> 1001, 672, 520, 309, 349, 824, 325, 387, 369, 3…
## $ RentQrange            <dbl> 316, 205, 150, 114, 134, 361, 139, 146, 177, 14…
## $ MedRent               <dbl> 1001, 627, 484, 333, 340, 736, 338, 355, 353, 3…
## $ MedRentPctHousInc     <dbl> 23.8, 27.6, 24.1, 28.7, 26.4, 24.4, 26.3, 25.2,…
## $ MedOwnCostPctInc      <dbl> 21.1, 20.7, 21.7, 20.6, 17.3, 20.8, 15.1, 20.7,…
## $ MedOwnCostPctIncNoMtg <dbl> 14.0, 12.5, 11.6, 14.5, 11.7, 12.5, 12.2, 12.8,…
## $ NumInShelters         <dbl> 11, 0, 16, 0, 327, 0, 21, 125, 43, 1, 20, 28, 2…
## $ NumStreet             <dbl> 0, 0, 0, 0, 4, 0, 0, 15, 4, 0, 49, 2, 0, 1, 17,…
## $ PctForeignBorn        <dbl> 10.66, 8.30, 5.00, 2.04, 1.49, 9.19, 0.87, 1.99…
## $ PctBornSameState      <dbl> 53.72, 77.17, 44.77, 88.71, 64.35, 77.30, 73.70…
## $ PctSameHouse85        <dbl> 65.29, 71.27, 36.60, 56.70, 42.29, 63.45, 54.85…
## $ PctSameCity85         <dbl> 78.09, 90.22, 61.26, 90.17, 70.61, 82.23, 85.55…
## $ PctSameState85        <dbl> 89.14, 96.12, 82.85, 96.24, 85.66, 93.53, 91.51…
## $ LemasSwornFT          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 198, NA, NA, NA…
## $ LemasSwFTPerPop       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 183.53, NA, NA,…
## $ LemasSwFTFieldOps     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 187, NA, NA, NA…
## $ LemasSwFTFieldPerPop  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 173.33, NA, NA,…
## $ LemasTotalReq         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 73432, NA, NA, …
## $ LemasTotReqPerPop     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 68065.1, NA, NA…
## $ PolicReqPerOffic      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 370.9, NA, NA, …
## $ PolicPerPop           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 183.5, NA, NA, …
## $ RacialMatchCommPol    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 89.32, NA, NA, …
## $ PctPolicWhite         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 78.28, NA, NA, …
## $ PctPolicBlack         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 11.11, NA, NA, …
## $ PctPolicHisp          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 10.61, NA, NA, …
## $ PctPolicAsian         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 0.00, NA, NA, N…
## $ PctPolicMinor         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 21.72, NA, NA, …
## $ OfficAssgnDrugUnits   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 13, NA, NA, NA,…
## $ NumKindsDrugsSeiz     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 12, NA, NA, NA,…
## $ PolicAveOTWorked      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 60.2, NA, NA, N…
## $ LandArea              <dbl> 6.5, 10.6, 10.6, 5.2, 70.4, 10.9, 39.2, 30.9, 7…
## $ PopDens               <dbl> 1845.9, 2186.7, 2780.9, 3217.7, 1995.7, 2643.5,…
## $ PctUsePubTrans        <dbl> 9.63, 3.84, 4.37, 3.31, 0.97, 9.62, 0.70, 1.41,…
## $ PolicCars             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 100, NA, NA, NA…
## $ PolicOperBudg         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 9315474, NA, NA…
## $ LemasPctPolicOnPatr   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 94.44, NA, NA, …
## $ LemasGangUnitDeploy   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 10, NA, NA, NA,…
## $ LemasPctOfficDrugUn   <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,…
## $ PolicBudgPerPop       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 86346.3, NA, NA…
## $ ViolentCrimesPerPop   <dbl> 41.02, 127.56, 218.59, 306.64, 442.95, 226.63, …
#summarize key attributes of the data
introduce(CC)
##   rows columns discrete_columns continuous_columns all_missing_columns
## 1 1994     125                0                125                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                36851           319             249250      2020864

Here, “complete_rows” refers to the number of rows without any missing values, “all_missing_columns” refers to the number of missing columns (the entire column is NA), “total_observations” refers to each value in the dataset (including missing values), and “discrete_columns” refers to the number of categorical variables in our data. From our findings, we see that there are no cateogorial variables. Additionally, out of the 249250 values, there are a total of 36851 missing values.

We can visualize our findings with a barplot that gives us the proportions of each attribute. The plot_intro function from the DataExploration package lets us do just that.

plot_intro(CC)

From our visualization, it becomes clear that:
1. Only roughly 16% of all rows are not completely missing
2. About 15% of the values in the dataset are missing
Missing values definitely will cause problems. Hence the next step will be to take a closer look at what is missing and what we can do to alleviate the problem.

The following code breaks down the number of missing values by variable with a for loop, looping over each coloumn in our dataset.

na_per_col <- c()
for(i in 1:ncol(CC)){
  na_per_col[i] <- sum(is.na(CC[,i]))
}
names(na_per_col) <- names(CC)
na_per_col
##            population         householdsize          racepctblack 
##                     0                     0                     0 
##          racePctWhite          racePctAsian           racePctHisp 
##                     0                     0                     0 
##           agePct12t21           agePct12t29           agePct16t24 
##                     0                     0                     0 
##            agePct65up             numbUrban              pctUrban 
##                     0                     0                     0 
##             medIncome              pctWWage          pctWFarmSelf 
##                     0                     0                     0 
##            pctWInvInc            pctWSocSec           pctWPubAsst 
##                     0                     0                     0 
##            pctWRetire             medFamInc             perCapInc 
##                     0                     0                     0 
##           whitePerCap           blackPerCap          indianPerCap 
##                     0                     0                     0 
##           AsianPerCap           OtherPerCap            HispPerCap 
##                     0                     1                     0 
##           NumUnderPov        PctPopUnderPov       PctLess9thGrade 
##                     0                     0                     0 
##          PctNotHSGrad           PctBSorMore         PctUnemployed 
##                     0                     0                     0 
##             PctEmploy           PctEmplManu       PctEmplProfServ 
##                     0                     0                     0 
##          PctOccupManu      PctOccupMgmtProf        MalePctDivorce 
##                     0                     0                     0 
##        MalePctNevMarr          FemalePctDiv           TotalPctDiv 
##                     0                     0                     0 
##            PersPerFam            PctFam2Par           PctKids2Par 
##                     0                     0                     0 
##      PctYoungKids2Par           PctTeen2Par   PctWorkMomYoungKids 
##                     0                     0                     0 
##            PctWorkMom   NumKidsBornNeverMar   PctKidsBornNeverMar 
##                     0                     0                     0 
##              NumImmig        PctImmigRecent          PctImmigRec5 
##                     0                     0                     0 
##          PctImmigRec8         PctImmigRec10        PctRecentImmig 
##                     0                     0                     0 
##          PctRecImmig5          PctRecImmig8         PctRecImmig10 
##                     0                     0                     0 
##      PctSpeakEnglOnly   PctNotSpeakEnglWell       PctLargHouseFam 
##                     0                     0                     0 
##     PctLargHouseOccup      PersPerOccupHous     PersPerOwnOccHous 
##                     0                     0                     0 
##    PersPerRentOccHous       PctPersOwnOccup      PctPersDenseHous 
##                     0                     0                     0 
##        PctHousLess3BR              MedNumBR            HousVacant 
##                     0                     0                     0 
##          PctHousOccup         PctHousOwnOcc      PctVacantBoarded 
##                     0                     0                     0 
##        PctVacMore6Mos        MedYrHousBuilt        PctHousNoPhone 
##                     0                     0                     0 
##        PctWOFullPlumb        OwnOccLowQuart          OwnOccMedVal 
##                     0                     0                     0 
##         OwnOccHiQuart          OwnOccQrange              RentLowQ 
##                     0                     0                     0 
##            RentMedian             RentHighQ            RentQrange 
##                     0                     0                     0 
##               MedRent     MedRentPctHousInc      MedOwnCostPctInc 
##                     0                     0                     0 
## MedOwnCostPctIncNoMtg         NumInShelters             NumStreet 
##                     0                     0                     0 
##        PctForeignBorn      PctBornSameState        PctSameHouse85 
##                     0                     0                     0 
##         PctSameCity85        PctSameState85          LemasSwornFT 
##                     0                     0                  1675 
##       LemasSwFTPerPop     LemasSwFTFieldOps  LemasSwFTFieldPerPop 
##                  1675                  1675                  1675 
##         LemasTotalReq     LemasTotReqPerPop      PolicReqPerOffic 
##                  1675                  1675                  1675 
##           PolicPerPop    RacialMatchCommPol         PctPolicWhite 
##                  1675                  1675                  1675 
##         PctPolicBlack          PctPolicHisp         PctPolicAsian 
##                  1675                  1675                  1675 
##         PctPolicMinor   OfficAssgnDrugUnits     NumKindsDrugsSeiz 
##                  1675                  1675                  1675 
##      PolicAveOTWorked              LandArea               PopDens 
##                  1675                     0                     0 
##        PctUsePubTrans             PolicCars         PolicOperBudg 
##                     0                  1675                  1675 
##   LemasPctPolicOnPatr   LemasGangUnitDeploy   LemasPctOfficDrugUn 
##                  1675                  1675                     0 
##       PolicBudgPerPop   ViolentCrimesPerPop 
##                  1675                     0

This can be visualized as well with a barplot which illustrates the proportion of missing values for each variable given that the variable contains missing values. The plot_missing function from the DataExploration package allows us to accomplish this.

plot_missing(CC[,na_per_col!= 0])

From our illustration, we observe that out of the 23 variables that contain missing values, all of them except for “OtherPerCap” has a significant portion of missing values. Because those variables are mostly missing values, we can drop them using the drop_columns function since they probably will not provide us with much information. Doing this drop reduces the number of variables in our dataset to 103 (previously 125) as shown with the dim function.

cleaned_CC <- drop_columns(CC, names(na_per_col)[na_per_col == 1675])
cleaned_CC <- as.data.frame(cleaned_CC)
dim(cleaned_CC)
## [1] 1994  103

Now that we have learned and cleaned our dataset a little bit, we can get a general summary of our data using the summary function. This will provide us with some useful summary statistics of our variables as well as give us insight as to how they are distributed. We can visualize these distributions by plotting the histograms corresponding to each variable.

summary(cleaned_CC)
##    population      householdsize    racepctblack    racePctWhite  
##  Min.   :  10005   Min.   :1.600   Min.   : 0.00   Min.   : 2.68  
##  1st Qu.:  14359   1st Qu.:2.490   1st Qu.: 0.94   1st Qu.:75.88  
##  Median :  22681   Median :2.650   Median : 3.15   Median :89.61  
##  Mean   :  52251   Mean   :2.707   Mean   : 9.51   Mean   :83.49  
##  3rd Qu.:  43154   3rd Qu.:2.850   3rd Qu.:11.96   3rd Qu.:95.99  
##  Max.   :7322564   Max.   :5.280   Max.   :96.67   Max.   :99.63  
##                                                                   
##   racePctAsian      racePctHisp      agePct12t21     agePct12t29   
##  Min.   : 0.0300   Min.   : 0.120   Min.   : 4.58   Min.   : 9.38  
##  1st Qu.: 0.6125   1st Qu.: 0.920   1st Qu.:12.23   1st Qu.:24.38  
##  Median : 1.2400   Median : 2.340   Median :13.62   Median :26.77  
##  Mean   : 2.7508   Mean   : 8.482   Mean   :14.43   Mean   :27.62  
##  3rd Qu.: 2.7375   3rd Qu.: 8.610   3rd Qu.:15.39   3rd Qu.:29.18  
##  Max.   :57.4600   Max.   :95.290   Max.   :54.40   Max.   :70.51  
##                                                                    
##   agePct16t24      agePct65up       numbUrban          pctUrban     
##  Min.   : 4.64   Min.   : 1.660   Min.   :      0   Min.   :  0.00  
##  1st Qu.:11.34   1st Qu.: 8.922   1st Qu.:      0   1st Qu.:  0.00  
##  Median :12.54   Median :11.855   Median :  17348   Median :100.00  
##  Mean   :13.99   Mean   :12.005   Mean   :  46672   Mean   : 69.62  
##  3rd Qu.:14.36   3rd Qu.:14.547   3rd Qu.:  41932   3rd Qu.:100.00  
##  Max.   :63.62   Max.   :52.770   Max.   :7322564   Max.   :100.00  
##                                                                     
##    medIncome         pctWWage      pctWFarmSelf      pctWInvInc   
##  Min.   : 11576   Min.   :31.68   Min.   :0.0000   Min.   : 7.91  
##  1st Qu.: 23597   1st Qu.:73.22   1st Qu.:0.4700   1st Qu.:34.19  
##  Median : 30896   Median :78.38   Median :0.7000   Median :42.38  
##  Mean   : 33699   Mean   :78.08   Mean   :0.8933   Mean   :43.36  
##  3rd Qu.: 41215   3rd Qu.:83.70   3rd Qu.:1.1100   3rd Qu.:52.07  
##  Max.   :123625   Max.   :96.62   Max.   :6.5300   Max.   :89.04  
##                                                                   
##    pctWSocSec     pctWPubAsst       pctWRetire      medFamInc     
##  Min.   : 4.81   Min.   : 0.500   Min.   : 3.46   Min.   : 13785  
##  1st Qu.:20.98   1st Qu.: 3.362   1st Qu.:12.99   1st Qu.: 29307  
##  Median :26.79   Median : 5.720   Median :15.66   Median : 36010  
##  Mean   :26.66   Mean   : 6.806   Mean   :16.06   Mean   : 39553  
##  3rd Qu.:31.84   3rd Qu.: 9.150   3rd Qu.:18.78   3rd Qu.: 46683  
##  Max.   :76.39   Max.   :26.920   Max.   :45.51   Max.   :131315  
##                                                                   
##    perCapInc      whitePerCap     blackPerCap      indianPerCap   
##  Min.   : 5237   Min.   : 5472   Min.   :     0   Min.   :     0  
##  1st Qu.:11548   1st Qu.:12596   1st Qu.:  6706   1st Qu.:  6336  
##  Median :13977   Median :15028   Median :  9664   Median :  9834  
##  Mean   :15522   Mean   :16535   Mean   : 11472   Mean   : 12257  
##  3rd Qu.:17774   3rd Qu.:18610   3rd Qu.: 14464   3rd Qu.: 14690  
##  Max.   :63302   Max.   :68850   Max.   :212120   Max.   :480000  
##                                                                   
##   AsianPerCap      OtherPerCap       HispPerCap     NumUnderPov       
##  Min.   :     0   Min.   :     0   Min.   :    0   Min.   :     78.0  
##  1st Qu.:  8441   1st Qu.:  5500   1st Qu.: 7253   1st Qu.:    936.2  
##  Median : 12331   Median :  8144   Median : 9676   Median :   2217.5  
##  Mean   : 14284   Mean   :  9375   Mean   :10989   Mean   :   7398.4  
##  3rd Qu.: 17346   3rd Qu.: 11378   3rd Qu.:13360   3rd Qu.:   5097.5  
##  Max.   :106165   Max.   :137000   Max.   :54648   Max.   :1384994.0  
##                   NA's   :1                                           
##  PctPopUnderPov   PctLess9thGrade   PctNotHSGrad    PctBSorMore   
##  Min.   : 0.640   Min.   : 0.200   Min.   : 2.09   Min.   : 1.63  
##  1st Qu.: 4.692   1st Qu.: 4.770   1st Qu.:14.20   1st Qu.:14.09  
##  Median : 9.650   Median : 7.920   Median :21.66   Median :19.62  
##  Mean   :11.796   Mean   : 9.444   Mean   :22.70   Mean   :22.99  
##  3rd Qu.:17.078   3rd Qu.:12.245   3rd Qu.:29.66   3rd Qu.:28.93  
##  Max.   :48.820   Max.   :49.890   Max.   :73.66   Max.   :73.63  
##                                                                   
##  PctUnemployed      PctEmploy      PctEmplManu    PctEmplProfServ
##  Min.   : 1.320   Min.   :24.82   Min.   : 2.05   Min.   : 8.69  
##  1st Qu.: 4.090   1st Qu.:56.35   1st Qu.:11.94   1st Qu.:20.11  
##  Median : 5.485   Median :62.27   Median :16.66   Median :23.41  
##  Mean   : 6.024   Mean   :61.78   Mean   :17.79   Mean   :24.58  
##  3rd Qu.: 7.430   3rd Qu.:67.50   3rd Qu.:22.75   3rd Qu.:27.63  
##  Max.   :23.830   Max.   :84.67   Max.   :50.03   Max.   :62.67  
##                                                                  
##   PctOccupManu    PctOccupMgmtProf MalePctDivorce   MalePctNevMarr 
##  Min.   : 1.370   Min.   : 6.48    Min.   : 2.130   Min.   :12.06  
##  1st Qu.: 9.072   1st Qu.:21.92    1st Qu.: 7.162   1st Qu.:25.41  
##  Median :13.040   Median :26.30    Median : 9.240   Median :29.00  
##  Mean   :13.747   Mean   :28.25    Mean   : 9.180   Mean   :30.67  
##  3rd Qu.:17.465   3rd Qu.:32.89    3rd Qu.:11.110   3rd Qu.:33.47  
##  Max.   :44.270   Max.   :64.97    Max.   :19.090   Max.   :76.32  
##                                                                    
##   FemalePctDiv    TotalPctDiv      PersPerFam      PctFam2Par   
##  Min.   : 3.35   Min.   : 2.83   Min.   :2.290   Min.   :32.24  
##  1st Qu.: 9.94   1st Qu.: 8.64   1st Qu.:2.990   1st Qu.:67.67  
##  Median :12.63   Median :11.04   Median :3.095   Median :74.77  
##  Mean   :12.40   Mean   :10.88   Mean   :3.129   Mean   :73.90  
##  3rd Qu.:14.80   3rd Qu.:13.06   3rd Qu.:3.220   3rd Qu.:81.64  
##  Max.   :23.46   Max.   :19.11   Max.   :4.640   Max.   :93.60  
##                                                                 
##   PctKids2Par    PctYoungKids2Par  PctTeen2Par    PctWorkMomYoungKids
##  Min.   :26.11   Min.   : 27.43   Min.   :30.64   Min.   :24.42      
##  1st Qu.:63.62   1st Qu.: 74.42   1st Qu.:69.92   1st Qu.:55.45      
##  Median :72.06   Median : 83.77   Median :76.67   Median :60.70      
##  Mean   :70.91   Mean   : 81.75   Mean   :75.34   Mean   :60.43      
##  3rd Qu.:79.82   3rd Qu.: 91.44   3rd Qu.:82.52   3rd Qu.:65.80      
##  Max.   :92.58   Max.   :100.00   Max.   :97.34   Max.   :87.97      
##                                                                      
##    PctWorkMom    NumKidsBornNeverMar PctKidsBornNeverMar    NumImmig      
##  Min.   :41.95   Min.   :     0.0    Min.   : 0.000      Min.   :     20  
##  1st Qu.:64.96   1st Qu.:   146.2    1st Qu.: 1.083      1st Qu.:    407  
##  Median :69.25   Median :   361.0    Median : 2.080      Median :   1040  
##  Mean   :68.80   Mean   :  2041.5    Mean   : 3.140      Mean   :   6314  
##  3rd Qu.:73.34   3rd Qu.:  1070.2    3rd Qu.: 3.980      3rd Qu.:   3389  
##  Max.   :89.37   Max.   :527557.0    Max.   :24.190      Max.   :2082931  
##                                                                           
##  PctImmigRecent    PctImmigRec5    PctImmigRec8   PctImmigRec10  
##  Min.   : 0.000   Min.   : 0.00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 6.942   1st Qu.:11.70   1st Qu.:17.91   1st Qu.:23.54  
##  Median :12.440   Median :19.64   Median :27.46   Median :35.58  
##  Mean   :13.734   Mean   :20.83   Mean   :28.12   Mean   :35.48  
##  3rd Qu.:18.090   3rd Qu.:27.69   3rd Qu.:37.07   3rd Qu.:46.81  
##  Max.   :64.290   Max.   :76.16   Max.   :80.81   Max.   :88.00  
##                                                                  
##  PctRecentImmig    PctRecImmig5     PctRecImmig8    PctRecImmig10   
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.180   1st Qu.: 0.290   1st Qu.: 0.410   1st Qu.: 0.540  
##  Median : 0.530   Median : 0.780   Median : 1.080   Median : 1.380  
##  Mean   : 1.149   Mean   : 1.781   Mean   : 2.424   Mean   : 3.094  
##  3rd Qu.: 1.370   3rd Qu.: 2.180   3rd Qu.: 2.870   3rd Qu.: 3.680  
##  Max.   :13.710   Max.   :19.930   Max.   :25.340   Max.   :32.630  
##                                                                     
##  PctSpeakEnglOnly PctNotSpeakEnglWell PctLargHouseFam  PctLargHouseOccup
##  Min.   : 6.15    Min.   : 0.000      Min.   : 0.960   Min.   : 0.440   
##  1st Qu.:83.70    1st Qu.: 0.510      1st Qu.: 3.390   1st Qu.: 2.360   
##  Median :91.78    Median : 0.955      Median : 4.290   Median : 3.050   
##  Mean   :86.55    Mean   : 2.538      Mean   : 5.465   Mean   : 3.975   
##  3rd Qu.:95.41    3rd Qu.: 2.467      3rd Qu.: 5.957   3rd Qu.: 4.280   
##  Max.   :98.98    Max.   :38.330      Max.   :34.870   Max.   :30.870   
##                                                                         
##  PersPerOccupHous PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup
##  Min.   :1.580    Min.   :1.610     Min.   :1.580      Min.   :13.93  
##  1st Qu.:2.400    1st Qu.:2.540     1st Qu.:2.120      1st Qu.:56.56  
##  Median :2.560    Median :2.700     Median :2.290      Median :64.99  
##  Mean   :2.614    Mean   :2.734     Mean   :2.382      Mean   :65.50  
##  3rd Qu.:2.770    3rd Qu.:2.890     3rd Qu.:2.540      3rd Qu.:75.30  
##  Max.   :4.520    Max.   :4.480     Max.   :4.730      Max.   :96.59  
##                                                                       
##  PctPersDenseHous PctHousLess3BR     MedNumBR       HousVacant      
##  Min.   : 0.050   Min.   : 3.06   Min.   :1.000   Min.   :    36.0  
##  1st Qu.: 1.300   1st Qu.:37.93   1st Qu.:2.000   1st Qu.:   310.0  
##  Median : 2.470   Median :46.78   Median :3.000   Median :   582.5  
##  Mean   : 4.325   Mean   :45.84   Mean   :2.626   Mean   :  1733.0  
##  3rd Qu.: 4.920   3rd Qu.:54.09   3rd Qu.:3.000   3rd Qu.:  1280.5  
##  Max.   :59.490   Max.   :95.34   Max.   :4.000   Max.   :172768.0  
##                                                                     
##   PctHousOccup   PctHousOwnOcc   PctVacantBoarded PctVacMore6Mos 
##  Min.   :37.47   Min.   :16.86   Min.   : 0.000   Min.   : 3.12  
##  1st Qu.:90.98   1st Qu.:54.09   1st Qu.: 0.780   1st Qu.:24.74  
##  Median :93.98   Median :62.08   Median : 1.740   Median :34.52  
##  Mean   :92.71   Mean   :62.63   Mean   : 2.791   Mean   :35.15  
##  3rd Qu.:95.91   3rd Qu.:71.59   3rd Qu.: 3.520   3rd Qu.:44.26  
##  Max.   :99.00   Max.   :96.36   Max.   :39.890   Max.   :82.13  
##                                                                  
##  MedYrHousBuilt PctHousNoPhone   PctWOFullPlumb   OwnOccLowQuart  
##  Min.   :1939   Min.   : 0.000   Min.   :0.0000   Min.   : 15700  
##  1st Qu.:1956   1st Qu.: 0.980   1st Qu.:0.1800   1st Qu.: 41800  
##  Median :1964   Median : 3.090   Median :0.3300   Median : 65900  
##  Mean   :1963   Mean   : 4.446   Mean   :0.4377   Mean   : 91116  
##  3rd Qu.:1971   3rd Qu.: 7.080   3rd Qu.:0.5700   3rd Qu.:126800  
##  Max.   :1987   Max.   :23.630   Max.   :5.3300   Max.   :500001  
##                                                                   
##   OwnOccMedVal    OwnOccHiQuart     OwnOccQrange       RentLowQ     
##  Min.   : 26600   Min.   : 36700   Min.   :     0   Min.   :  99.0  
##  1st Qu.: 56700   1st Qu.: 74800   1st Qu.: 32925   1st Qu.: 210.0  
##  Median : 84600   Median :109500   Median : 44250   Median : 305.0  
##  Mean   :116102   Mean   :149007   Mean   : 57891   Mean   : 328.1  
##  3rd Qu.:156250   3rd Qu.:192850   3rd Qu.: 67475   3rd Qu.: 420.0  
##  Max.   :500001   Max.   :500001   Max.   :331000   Max.   :1001.0  
##                                                                     
##    RentMedian       RentHighQ        RentQrange       MedRent      
##  Min.   : 120.0   Min.   : 182.0   Min.   :  0.0   Min.   : 192.0  
##  1st Qu.: 286.0   1st Qu.: 361.2   1st Qu.:139.0   1st Qu.: 363.0  
##  Median : 394.0   Median : 484.0   Median :173.0   Median : 467.0  
##  Mean   : 428.4   Mean   : 528.4   Mean   :200.3   Mean   : 502.7  
##  3rd Qu.: 547.8   3rd Qu.: 667.8   3rd Qu.:241.0   3rd Qu.: 621.0  
##  Max.   :1001.0   Max.   :1001.0   Max.   :803.0   Max.   :1001.0  
##                                                                    
##  MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters     
##  Min.   :14.90     Min.   :14.10    Min.   :10.10         Min.   :    0.00  
##  1st Qu.:24.30     1st Qu.:19.10    1st Qu.:11.90         1st Qu.:    0.00  
##  Median :26.20     Median :21.20    Median :12.80         Median :    0.00  
##  Mean   :26.33     Mean   :21.21    Mean   :13.03         Mean   :   67.72  
##  3rd Qu.:28.10     3rd Qu.:23.30    3rd Qu.:13.80         3rd Qu.:   24.00  
##  Max.   :35.10     Max.   :32.70    Max.   :23.40         Max.   :23383.00  
##                                                                             
##    NumStreet        PctForeignBorn   PctBornSameState PctSameHouse85 
##  Min.   :    0.00   Min.   : 0.180   Min.   : 6.75    Min.   :11.83  
##  1st Qu.:    0.00   1st Qu.: 2.080   1st Qu.:48.87    1st Qu.:44.68  
##  Median :    0.00   Median : 4.490   Median :62.52    Median :51.87  
##  Mean   :   18.71   Mean   : 7.606   Mean   :60.50    Mean   :51.32  
##  3rd Qu.:    1.00   3rd Qu.: 9.585   3rd Qu.:74.38    3rd Qu.:58.51  
##  Max.   :10447.00   Max.   :60.400   Max.   :93.14    Max.   :78.56  
##                                                                      
##  PctSameCity85   PctSameState85     LandArea          PopDens     
##  Min.   :27.95   Min.   :32.83   Min.   :   0.90   Min.   :   10  
##  1st Qu.:71.92   1st Qu.:84.73   1st Qu.:   7.40   1st Qu.: 1171  
##  Median :79.31   Median :89.64   Median :  13.70   Median : 1996  
##  Mean   :77.11   Mean   :87.73   Mean   :  27.96   Mean   : 2790  
##  3rd Qu.:84.70   3rd Qu.:92.73   3rd Qu.:  25.77   3rd Qu.: 3270  
##  Max.   :96.59   Max.   :99.90   Max.   :3569.80   Max.   :44230  
##                                                                   
##  PctUsePubTrans   LemasPctOfficDrugUn ViolentCrimesPerPop
##  Min.   : 0.000   Min.   : 0.00       Min.   :   0.0     
##  1st Qu.: 0.350   1st Qu.: 0.00       1st Qu.: 161.7     
##  Median : 1.220   Median : 0.00       Median : 374.1     
##  Mean   : 3.063   Mean   : 1.01       Mean   : 589.1     
##  3rd Qu.: 3.377   3rd Qu.: 0.00       3rd Qu.: 794.4     
##  Max.   :54.330   Max.   :48.44       Max.   :4877.1     
## 
par(mfrow=c(2,4))
for (name in names(cleaned_CC)){
  hist(cleaned_CC[ ,name], col ='gray80', main =paste(name), xlab ='', las = 1)
}

We notice that many of the variables have a skewed distribution as depicted by the shapes of the histograms (this indicates that we might have to scale our data which will be addressed later). Having seen how each variable behaves on its own, we can even go one step further and examine how each variable behaves with respect to one another. Learning which variables are correlated can help us gain intuition behind why certain variables can be dropped. Below is a matrix of correlations (since the matrix is really large, we will use the head function to just view a small portion of it).

head(cor(cleaned_CC))
##                population householdsize racepctblack racePctWhite racePctAsian
## population     1.00000000   -0.01998124   0.11985409   -0.1691664   0.09020787
## householdsize -0.01998124    1.00000000  -0.05641415   -0.2427810   0.19010502
## racepctblack   0.11985409   -0.05641415   1.00000000   -0.8038940  -0.09006910
## racePctWhite  -0.16916638   -0.24278100  -0.80389404    1.0000000  -0.28996778
## racePctAsian   0.09020787    0.19010502  -0.09006910   -0.2899678   1.00000000
## racePctHisp    0.08698228    0.50089241  -0.07905903   -0.4141658   0.19388596
##               racePctHisp  agePct12t21 agePct12t29 agePct16t24  agePct65up
## population     0.08698228 -0.008922560  0.04603586  0.01748179 -0.04535452
## householdsize  0.50089241  0.486331416  0.37694662  0.30814265 -0.57727020
## racepctblack  -0.07905903  0.089593377  0.11905687  0.08754291  0.03501101
## racePctWhite  -0.41416581 -0.140683973 -0.21562068 -0.12389602  0.13680899
## racePctAsian   0.19388596 -0.008304097  0.07313797  0.03613463 -0.21793309
## racePctHisp    1.00000000  0.117924145  0.16616652  0.05601813 -0.20332624
##                 numbUrban     pctUrban   medIncome     pctWWage pctWFarmSelf
## population     0.99899475  0.114537414 -0.04902656 -0.007606969  -0.06468539
## householdsize -0.02052241 -0.014662170  0.18105895  0.417834151   0.16431834
## racepctblack   0.11906300  0.004526951 -0.34025272 -0.211859282  -0.14507967
## racePctWhite  -0.16957554 -0.054400107  0.29346975  0.108838173   0.09486210
## racePctAsian   0.09801341  0.227859902  0.31561770  0.259413552  -0.07956796
## racePctHisp    0.08736676  0.032882668 -0.15056995  0.009222161   0.07617366
##                pctWInvInc  pctWSocSec pctWPubAsst  pctWRetire   medFamInc
## population    -0.07920605 -0.05825732  0.10271673 -0.05905109 -0.05290946
## householdsize -0.16463318 -0.43464444  0.13502665 -0.31819863  0.09156280
## racepctblack  -0.48809256  0.10083477  0.43830333 -0.06974238 -0.34099531
## racePctWhite   0.59341547  0.07109588 -0.58551408  0.22263949  0.32767722
## racePctAsian   0.12784966 -0.31285796 -0.02913192 -0.15870111  0.29717196
## racePctHisp   -0.42104452 -0.13674881  0.44011865 -0.25614138 -0.21156285
##                 perCapInc whitePerCap blackPerCap indianPerCap AsianPerCap
## population    -0.02172376  0.03041797 -0.02529795 -0.009580455 -0.03695617
## householdsize -0.13058976 -0.12895857  0.03287737 -0.003111265 -0.05825344
## racepctblack  -0.26389697 -0.10109669 -0.19326730 -0.037701703 -0.10470790
## racePctWhite   0.28791549  0.11754179  0.12989914  0.033979863  0.13984378
## racePctAsian   0.25178076  0.28881002  0.18682018  0.060575837  0.05749987
## racePctHisp   -0.23610945 -0.20335442 -0.01488340 -0.009598256 -0.10456218
##               OtherPerCap  HispPerCap NumUnderPov PctPopUnderPov
## population             NA -0.04852552  0.98844390     0.08687736
## householdsize          NA -0.07778172 -0.01217924     0.07858127
## racepctblack           NA -0.13602581  0.15561213     0.47335079
## racePctWhite           NA  0.19643357 -0.19509753    -0.53178950
## racePctAsian           NA  0.15782486  0.05269314    -0.14309254
## racePctHisp            NA -0.21792719  0.10357355     0.34569067
##               PctLess9thGrade PctNotHSGrad  PctBSorMore PctUnemployed
## population          0.0341362   0.04520605 -0.003314509    0.08159569
## householdsize       0.2453181   0.13656123 -0.039715817    0.17072483
## racepctblack        0.2317974   0.34782645 -0.176438038    0.38426340
## racePctWhite       -0.4565570  -0.48549824  0.215510874   -0.51925170
## racePctAsian       -0.1116814  -0.18329153  0.256835320   -0.12349254
## racePctHisp         0.6399916   0.50212299 -0.258944239    0.46934622
##                 PctEmploy PctEmplManu PctEmplProfServ PctOccupManu
## population    -0.01436350 -0.05767265     0.013357012  -0.01558050
## householdsize  0.08192078  0.02998270    -0.071258377   0.08769569
## racepctblack  -0.26395386 -0.01813612     0.098149205   0.22408021
## racePctWhite   0.25107068  0.03526150     0.019681114  -0.26962446
## racePctAsian   0.19844967 -0.07100244     0.006181762  -0.22587980
## racePctHisp   -0.16450296 -0.03245830    -0.222755296   0.24086108
##               PctOccupMgmtProf MalePctDivorce MalePctNevMarr FemalePctDiv
## population        -0.006115214     0.09818571      0.1259982   0.12086191
## householdsize     -0.087931228    -0.43051333      0.1980953  -0.32918255
## racepctblack      -0.193809485     0.39593651      0.2633142   0.42418309
## racePctWhite       0.258503112    -0.33843081     -0.3475338  -0.44451726
## racePctAsian       0.229080506    -0.10700149      0.1657728  -0.02724117
## racePctHisp       -0.289367195     0.02119139      0.1504020   0.14757123
##               TotalPctDiv  PersPerFam PctFam2Par PctKids2Par PctYoungKids2Par
## population      0.1117944  0.05483077 -0.1436528  -0.1477879       -0.1176591
## householdsize  -0.3920254  0.83427003  0.2593835   0.1848487        0.1785928
## racepctblack    0.4273592  0.04352565 -0.6987393  -0.7308858       -0.6560017
## racePctWhite   -0.4057335 -0.39621600  0.6410727   0.7020485        0.6020396
## racePctAsian   -0.0706462  0.20304202  0.1264430   0.1056274        0.1603422
## racePctHisp     0.0858183  0.65260879 -0.1247147  -0.1933914       -0.1290443
##               PctTeen2Par PctWorkMomYoungKids  PctWorkMom NumKidsBornNeverMar
## population    -0.13880404        -0.048911713 -0.07832886          0.96919546
## householdsize  0.24793655        -0.225003314 -0.27454503         -0.01382065
## racepctblack  -0.68928536         0.172996713  0.08223719          0.18128400
## racePctWhite   0.61672564         0.006455434  0.15530871         -0.20145070
## racePctAsian   0.10197458        -0.107829302 -0.13979304          0.04170485
## racePctHisp   -0.08922259        -0.285707381 -0.41993518          0.06913143
##               PctKidsBornNeverMar    NumImmig PctImmigRecent PctImmigRec5
## population              0.1918587  0.93698544     0.05969798   0.07400148
## householdsize           0.0424901  0.02446960     0.04096913   0.06311299
## racepctblack            0.8045245  0.04345595     0.15684744   0.18712843
## racePctWhite           -0.7982256 -0.12413352    -0.21928174  -0.28559015
## racePctAsian           -0.0445678  0.11977194     0.14309005   0.17894882
## racePctHisp             0.2177592  0.12170301     0.06053260   0.11772738
##               PctImmigRec8 PctImmigRec10 PctRecentImmig PctRecImmig5
## population      0.09096036     0.1055186     0.14177309   0.14457513
## householdsize   0.07525378     0.1082819     0.29947542   0.32161070
## racepctblack    0.21907883     0.2431958    -0.04405507  -0.03533397
## racePctWhite   -0.34792615    -0.4104704    -0.39610968  -0.41945931
## racePctAsian    0.24065764     0.2711773     0.57871625   0.57841982
## racePctHisp     0.15336367     0.2252240     0.60912950   0.63959849
##               PctRecImmig8 PctRecImmig10 PctSpeakEnglOnly PctNotSpeakEnglWell
## population      0.15028179    0.15128376       -0.1120853           0.1169453
## householdsize   0.32768383    0.34005924       -0.4508026           0.4539802
## racepctblack   -0.03133008   -0.02679911        0.1010608          -0.0536535
## racePctWhite   -0.43596004   -0.44658474        0.4046849          -0.4338322
## racePctAsian    0.61794808    0.61283928       -0.4069128           0.3197273
## racePctHisp     0.64653593    0.66857076       -0.9142703           0.8923207
##               PctLargHouseFam PctLargHouseOccup PersPerOccupHous
## population          0.1007336        0.06256921      -0.01239029
## householdsize       0.6842431        0.74163139       0.87884973
## racepctblack        0.1454162        0.07447286      -0.10083585
## racePctWhite       -0.5431399       -0.46718488      -0.22907158
## racePctAsian        0.2181128        0.20671411       0.21429846
## racePctHisp         0.7590884        0.73735224       0.55762262
##               PersPerOwnOccHous PersPerRentOccHous PctPersOwnOccup
## population          0.006475298         0.02142278     -0.15269723
## householdsize       0.812931820         0.73248303      0.10591993
## racepctblack       -0.149839069         0.09853665     -0.36660021
## racePctWhite       -0.134574764        -0.46963328      0.50394418
## racePctAsian        0.198028064         0.23973473     -0.09457199
## racePctHisp         0.472150928         0.66649194     -0.30369714
##               PctPersDenseHous PctHousLess3BR    MedNumBR  HousVacant
## population           0.1146375     0.13519473 -0.11160762  0.92506684
## householdsize        0.5498120    -0.29198804  0.18800102 -0.09101512
## racepctblack         0.1121861     0.22738771 -0.13896697  0.18344377
## racePctWhite        -0.5924452    -0.36473283  0.28306768 -0.20031790
## racePctAsian         0.2985607     0.04941892 -0.07194458  0.03235154
## racePctHisp          0.8643040     0.32426794 -0.30307948  0.07268417
##               PctHousOccup PctHousOwnOcc PctVacantBoarded PctVacMore6Mos
## population     -0.02107012   -0.15552193       0.14496181   -0.032799859
## householdsize   0.21639449    0.16091300       0.05906204   -0.035390554
## racepctblack   -0.18598389   -0.33264409       0.47508949    0.165820489
## racePctWhite    0.12343669    0.43829466      -0.44598166    0.005199846
## racePctAsian    0.18929321   -0.07703753      -0.11257058   -0.329860451
## racePctHisp    -0.05624573   -0.24200338       0.15654664   -0.139280497
##               MedYrHousBuilt PctHousNoPhone PctWOFullPlumb OwnOccLowQuart
## population       -0.05245494     0.03540845     0.07661476     0.01309797
## householdsize     0.23576331     0.01158037     0.17729330     0.10948380
## racepctblack     -0.08891917     0.46355039     0.25372037    -0.26722203
## racePctWhite      0.02160577    -0.47222110    -0.37774264     0.08656218
## racePctAsian      0.06365386    -0.25752141    -0.05578593     0.51168731
## racePctHisp       0.08225862     0.29802016     0.42852090     0.04405002
##               OwnOccMedVal OwnOccHiQuart OwnOccQrange      RentLowQ
## population      0.02186620    0.03763290  0.073552241 -0.0005833201
## householdsize   0.09842372    0.08492910  0.027318381  0.1372430501
## racepctblack   -0.25182349   -0.23572706 -0.139428213 -0.3078488853
## racePctWhite    0.07195308    0.06216770  0.008856967  0.1228098864
## racePctAsian    0.50903905    0.50198842  0.396457596  0.5055629358
## racePctHisp     0.04921780    0.04620829  0.041787865  0.0371101404
##                 RentMedian   RentHighQ  RentQrange      MedRent
## population     0.001442043  0.01469968  0.03545392 -0.007353084
## householdsize  0.153729058  0.16773905  0.16168116  0.165025196
## racepctblack  -0.293102246 -0.28356527 -0.14531907 -0.269213448
## racePctWhite   0.123818603  0.11901415  0.07178750  0.123516168
## racePctAsian   0.480286908  0.46678128  0.24122397  0.448396868
## racePctHisp    0.026300909  0.03252414  0.01362826  0.001199020
##               MedRentPctHousInc MedOwnCostPctInc MedOwnCostPctIncNoMtg
## population           0.06105356       0.04605370          -0.007638429
## householdsize        0.13527095       0.24506090          -0.100861914
## racepctblack         0.19330933      -0.06033315           0.217050630
## racePctWhite        -0.33748613      -0.17496918          -0.065463790
## racePctAsian         0.15282841       0.34479739          -0.204903686
## racePctHisp          0.26945343       0.31486880          -0.095597760
##               NumInShelters   NumStreet PctForeignBorn PctBornSameState
## population       0.93032799  0.92103806     0.13311280      -0.06776498
## householdsize   -0.03336532 -0.01217337     0.33805885      -0.06011572
## racepctblack     0.11256032  0.06126485    -0.09464446       0.09589610
## racePctWhite    -0.14207182 -0.09979246    -0.37735606       0.11169349
## racePctAsian     0.06594532  0.06870114     0.59404355      -0.34866914
## racePctHisp      0.05269424  0.05869189     0.69121593      -0.23186484
##               PctSameHouse85 PctSameCity85 PctSameState85      LandArea
## population       -0.03340836    0.01686028  -0.0281548497  0.1963973642
## householdsize    -0.07219896   -0.12507445  -0.0497000116 -0.0049557789
## racepctblack     -0.03431448    0.06374576  -0.0006933544  0.0403043730
## racePctWhite      0.15740343   -0.02602928   0.0328709611 -0.0457945937
## racePctAsian     -0.14974903   -0.14166234  -0.1480290209  0.0005761101
## racePctHisp      -0.12105218    0.04648453   0.0163896829 -0.0052344098
##                   PopDens PctUsePubTrans LemasPctOfficDrugUn
## population     0.21394125     0.31966609          0.20139178
## householdsize  0.03158455    -0.05373092         -0.08377101
## racepctblack   0.09431397     0.17293606          0.24596833
## racePctWhite  -0.32032835    -0.22887554         -0.25760969
## racePctAsian   0.29643645     0.22793683          0.06364783
## racePctHisp    0.36858895     0.08412792          0.07654516
##               ViolentCrimesPerPop
## population             0.21235381
## householdsize         -0.02011018
## racepctblack           0.62836776
## racePctWhite          -0.67684882
## racePctAsian           0.03194948
## racePctHisp            0.25359641

Regression Task

Our goal is to develop a model to predict “ViolentCrimesPerPop”

Since there are so many variables in our dataset, we hope to find an appropriate model using dimension reduction methods or shrinkage methods so that we can predict our desired response in a “friendly” way. In particular, we will use methods such as PC regression, PLS regression, Ridge regression, and Lasso regression. Addtionally, we will use cross-validation to select tuning parameters and utilize the three-way holdout method to perform model selection and model assessment.

We begin by defining our response vector and feature matrix. Also, recall that there is one column (“OtherPerCap”) which contains a single missing value of which we chose to keep. To deal with this missing value, we will impute the missing value with its respective column mean.

#response vector
y <- cleaned_CC$ViolentCrimesPerPop
response <- "y"

#feature matrix with replaced missing value
X <- subset(cleaned_CC, select = -c(ViolentCrimesPerPop))
X$OtherPerCap[is.na(X$OtherPerCap)] <- mean(X$OtherPerCap, na.rm = TRUE)
predictors <- names(X)

#combine feature matrix and response vector for convenience
dat <- as.data.frame(cbind(X,y))

Three-way hold-out method

Before we begin implementing any of the regularization methods, we first split the data into three different parts:
1. Training set: 60% of the data (chosen at random)
2. Validation set: 20% of the data (i.e. one half of the remaining 40% not in training, chosen at random)
3. Test set: 20% of the data (i.e. the other half of the remaining 40% not in training, chosen at random)

set.seed(1)
train_prop <- 0.6
total <- nrow(dat)

#training set
train <- sample(1:total, floor(total*train_prop))
testval <- c(1:total)[-train]

#validation set
validation <- sample(testval, size = floor(length(testval)/2))

#test set
test <- c(1:total)[-c(train, validation)]

PCA

After successfully prepping our data, we standardize it in order to ensure that the significance of each variable is properly captured relative to one another. The first thing that we would like to do is attempt to reduce the dimensionality by performing PCA.

#standardize feature matrix
X <- scale(X)

#compute variance-covariance matrix
n <- nrow(X)
S <- (1/(n-1)) * t(as.matrix(X)) %*% as.matrix(X)

#perform eigenvalue decomposition
EVD <- eigen(S)

#eigenvalues of S (the variances of each PC)
evalues <- EVD$values
head(evalues)
## [1] 24.343530 16.298249  9.134802  7.922403  6.406749  4.377326
#eigenvectors of S (loadings)
evectors <- EVD$vectors
head(evectors)
##              [,1]         [,2]        [,3]         [,4]        [,5]
## [1,] -0.026577636  0.065099858  0.14579982  0.184552471  0.25791833
## [2,]  0.004568478  0.131887283 -0.19031340 -0.083595496  0.14163395
## [3,] -0.102306123  0.006402538  0.08253572  0.028133461 -0.02983111
## [4,]  0.109552493 -0.123011775 -0.02356669 -0.024602880  0.04069955
## [5,]  0.047371094  0.144805274  0.03377442 -0.009803065 -0.04907106
## [6,] -0.066472116  0.172216454 -0.12716760  0.022964970 -0.01394782
##              [,6]        [,7]         [,8]         [,9]       [,10]       [,11]
## [1,]  0.017920849  0.04300980 -0.007308468  0.009270909 -0.01752143  0.01658662
## [2,] -0.092275175 -0.06945836 -0.066195076 -0.077128587 -0.04949304  0.08685429
## [3,] -0.033944385 -0.33980979 -0.072452598  0.005294221 -0.08859645  0.13479344
## [4,]  0.002376779  0.24042461  0.079127005 -0.037051010  0.04986506 -0.11844207
## [5,]  0.049042273  0.02310039  0.041174163  0.056528524  0.01405839  0.14956301
## [6,]  0.016825353  0.11177358 -0.065674167  0.017439324  0.07143277 -0.12478334
##            [,12]       [,13]       [,14]       [,15]       [,16]        [,17]
## [1,]  0.02417096 -0.00448563 -0.02054244 -0.01101617  0.01064100 -0.010208790
## [2,]  0.05320303 -0.01817154  0.04133420 -0.02792225 -0.05029930 -0.005984993
## [3,] -0.11358764 -0.11989426  0.21991616 -0.03876096 -0.05829349 -0.014462900
## [4,]  0.06307187  0.18213135 -0.14092898  0.02204136  0.05965231 -0.011517265
## [5,]  0.12567539 -0.16057485 -0.03681884  0.01337671 -0.12607560  0.139261012
## [6,] -0.05238597 -0.08298295 -0.07303860  0.00633800  0.10536800 -0.064508144
##             [,18]        [,19]         [,20]       [,21]        [,22]
## [1,]  0.012335047  0.008066279 -0.0006572833  0.01659684 -0.001498696
## [2,]  0.005616012  0.024631501 -0.0098504806 -0.08147366  0.059616601
## [3,] -0.021166101 -0.051420843  0.0044448381  0.03361956 -0.082371854
## [4,]  0.094606541  0.028545366  0.0256470516 -0.10357670  0.057285983
## [5,] -0.368869950  0.136818197 -0.1125299892  0.40007712 -0.034907653
## [6,]  0.103539563 -0.078308040 -0.0072723209 -0.04762336  0.055394553
##             [,23]        [,24]        [,25]        [,26]        [,27]
## [1,] -0.004304147  0.009933551  0.016030376  0.004640294 -0.009244075
## [2,] -0.042951091 -0.052796559 -0.077902485  0.032137798  0.004682224
## [3,]  0.177470262 -0.021563738  0.032877321  0.017927307 -0.066890249
## [4,] -0.173724157  0.027148074 -0.018740252 -0.026497756  0.099104044
## [5,]  0.096386783  0.027101988 -0.135746170 -0.117658885 -0.242247503
## [6,] -0.035728242  0.071051067 -0.002599899  0.016553087  0.140606586
##            [,28]       [,29]        [,30]       [,31]        [,32]       [,33]
## [1,]  0.01224319 -0.02381306  0.004598176 -0.02509612 -0.005139776 -0.01579424
## [2,] -0.03292691 -0.05412032  0.054488233  0.03172119  0.097821856 -0.02885663
## [3,] -0.08472306  0.02996253  0.035017291 -0.01015335 -0.004234876  0.19607923
## [4,]  0.10006823  0.04050458 -0.063466950 -0.02164266  0.044015928 -0.11529137
## [5,] -0.18529955 -0.03588542 -0.095768115  0.02631169 -0.011947679 -0.35861926
## [6,]  0.03135547 -0.15842536  0.132196520  0.06262484 -0.112271073  0.02118127
##             [,34]        [,35]        [,36]       [,37]       [,38]       [,39]
## [1,]  0.002647875 -0.007538536 -0.008024600 -0.01472617  0.01532305 -0.03279090
## [2,]  0.089952283  0.014390879 -0.050565445  0.07829235 -0.01354886  0.10814440
## [3,]  0.078259478 -0.116064876  0.053349695  0.06356608  0.11547131 -0.17954767
## [4,] -0.053360982  0.019965303  0.017147220 -0.03558513 -0.15230900  0.18953870
## [5,] -0.022223962  0.094632339  0.002748044 -0.09177312 -0.08614383 -0.08123203
## [6,] -0.009500068  0.121085398 -0.094885050  0.02055191  0.15719307 -0.10550619
##            [,40]       [,41]       [,42]       [,43]       [,44]       [,45]
## [1,]  0.03938087  0.01414469 -0.05741996  0.08223016  0.00287909  0.03512732
## [2,] -0.11904009  0.13202800 -0.05745720  0.12837136 -0.08353355 -0.15623845
## [3,]  0.01356902 -0.15992657 -0.10272887  0.06854872 -0.13588351 -0.09664213
## [4,] -0.13498945  0.05555136  0.02599796 -0.07538281  0.05756422  0.21995838
## [5,]  0.10382761  0.13636457  0.12889909  0.01332579  0.11072867 -0.10722184
## [6,]  0.18613851  0.17453111  0.11316429  0.04719531  0.09406834 -0.14040867
##            [,46]        [,47]       [,48]        [,49]       [,50]       [,51]
## [1,] -0.09208309 -0.008014516 -0.02614619 -0.023247759 -0.01595838 -0.01391626
## [2,]  0.02254294 -0.167543636 -0.12697078  0.016487177 -0.21233950  0.17841960
## [3,]  0.06009740  0.059699318  0.05337934  0.004733562 -0.05733635  0.04460733
## [4,] -0.01140995 -0.040932350 -0.17112909  0.045943860 -0.02616258 -0.01103662
## [5,] -0.04427226 -0.074888913  0.03545273  0.101399918  0.05776887 -0.15577436
## [6,]  0.01172869  0.139841036  0.15907314 -0.238334209 -0.04606000  0.17378738
##            [,52]        [,53]       [,54]       [,55]        [,56]       [,57]
## [1,] -0.08546778  0.055491757  0.05098578 -0.04218347 -0.016462811  0.00358729
## [2,]  0.30988896  0.002275641  0.16817313 -0.16816489  0.171070473  0.04861430
## [3,]  0.03859940  0.162938780 -0.06028919  0.07356701  0.022912234  0.01708291
## [4,] -0.03853784 -0.051991648 -0.02661532 -0.05800092 -0.107937072 -0.08145447
## [5,] -0.02013483 -0.053583447  0.04106358 -0.07741997 -0.022233053 -0.05592863
## [6,] -0.05144988 -0.046973997  0.03194063  0.06019650 -0.004860287 -0.08282579
##            [,58]        [,59]        [,60]       [,61]       [,62]       [,63]
## [1,] -0.00598417  0.033847177 -0.002253565 -0.01918254 -0.01942746  0.01205204
## [2,] -0.05948579  0.228635163 -0.044489180  0.09824357  0.08518101  0.13713744
## [3,] -0.07465325 -0.075434074 -0.073696451  0.01171874 -0.14042785 -0.05718693
## [4,]  0.00302883 -0.009003527 -0.044732715  0.04357659  0.16870732 -0.09845800
## [5,] -0.02813981  0.031045785  0.094647529 -0.02304627  0.11724966 -0.02220296
## [6,] -0.08541486  0.037179623  0.033837258 -0.04670896  0.01659687  0.01312556
##              [,64]       [,65]        [,66]        [,67]       [,68]
## [1,]  0.0193433090 -0.01007596 -0.008759277  0.036588780 -0.03507907
## [2,]  0.0008658202 -0.10301268  0.100263015 -0.001539129  0.02960534
## [3,] -0.1362979196 -0.18156326  0.074904983 -0.067149629  0.02149141
## [4,] -0.1507786298 -0.10370603  0.069060940 -0.148617185 -0.04340101
## [5,]  0.0264535029  0.01535314 -0.020576799 -0.021054941 -0.01480823
## [6,]  0.0621279896  0.10933976 -0.210319544  0.255603187 -0.03139649
##            [,69]         [,70]       [,71]       [,72]        [,73]
## [1,]  0.01681108 -0.0031317617 -0.01799005  0.03155739  0.061873322
## [2,] -0.07585506  0.0530965236 -0.06316023  0.03123251 -0.013741543
## [3,]  0.02782936 -0.0002594133  0.02629149  0.01546533 -0.036253282
## [4,]  0.08266508  0.0126065223 -0.01669295 -0.02517491 -0.009506757
## [5,] -0.04706139 -0.0206470179  0.02225068 -0.02188407 -0.056592960
## [6,]  0.05698817 -0.0319708319 -0.01593871  0.01643605 -0.028572526
##             [,74]        [,75]       [,76]        [,77]       [,78]
## [1,] -0.071716716  0.040418870 -0.07116238 -0.010167522 -0.10881923
## [2,] -0.189098138 -0.006858964  0.21504799 -0.389885210  0.05285137
## [3,] -0.053522116 -0.009368924 -0.09265473  0.043871395 -0.13967937
## [4,] -0.005148836  0.054709405  0.01100692  0.005541169 -0.12136304
## [5,] -0.053152678 -0.061828640  0.04519890  0.009302802 -0.05326495
## [6,] -0.062233447  0.069696700  0.12540680  0.046476732  0.03960356
##             [,79]        [,80]        [,81]       [,82]        [,83]
## [1,]  0.101997711 -0.072849287 -0.028313428  0.46409781 -0.009196756
## [2,]  0.001181653  0.008829965 -0.047619067  0.01364931  0.013420261
## [3,]  0.090392083 -0.227656660 -0.008063569 -0.19458386 -0.186374083
## [4,]  0.040586701 -0.321383915 -0.026871904 -0.18564652 -0.197329649
## [5,]  0.013096439 -0.070204488  0.031742092 -0.07778473 -0.052279059
## [6,] -0.132901885 -0.060663887  0.337004526 -0.06214878 -0.178214854
##            [,84]       [,85]      [,86]        [,87]      [,88]        [,89]
## [1,] -0.08265288 -0.01570085 0.01060036  0.004893220 0.01182844  0.194270122
## [2,] -0.06020545  0.08419881 0.05414943  0.180025100 0.03906225 -0.008069885
## [3,] -0.41340174 -0.01049703 0.08694862 -0.016737024 0.04628290  0.052443598
## [4,] -0.42613970 -0.04718964 0.11504841  0.032455641 0.02060317  0.041509420
## [5,] -0.14059341  0.03095404 0.10808799 -0.004715453 0.01164414  0.025370632
## [6,] -0.20301483  0.11493089 0.20617902 -0.049874803 0.01360616  0.062448865
##              [,90]       [,91]        [,92]        [,93]        [,94]
## [1,]  0.0951919643 0.010476679  0.020437789 -0.007806031  0.009477949
## [2,] -0.0358581498 0.006957098  0.020341569 -0.010408041 -0.001017762
## [3,] -0.0005616237 0.057065263 -0.014759082 -0.032051504 -0.040322506
## [4,]  0.0559086203 0.057536178 -0.032966279 -0.068948886 -0.031878264
## [5,] -0.0124768349 0.016957818 -0.009194687 -0.013737141 -0.040722084
## [6,] -0.0282286047 0.058613320 -0.003037201 -0.034661777 -0.011213528
##             [,95]        [,96]         [,97]         [,98]        [,99]
## [1,] -0.006723752  0.007160879  0.0086604257  5.342341e-03  0.058083089
## [2,]  0.011061589  0.012545210 -0.0055693641 -1.421555e-02 -0.015048108
## [3,] -0.019926131 -0.020567838 -0.0133201903 -2.131431e-03  0.004255648
## [4,] -0.044595208 -0.018484079 -0.0131029244 -7.091007e-03  0.002633125
## [5,] -0.016973724 -0.022581098  0.0072926885 -3.685843e-03 -0.002139027
## [6,] -0.004650588 -0.005709771 -0.0002122037  2.573254e-06 -0.004727481
##           [,100]        [,101]       [,102]
## [1,] 0.708202898 -1.936865e-13 0.000000e+00
## [2,] 0.004830813  9.010694e-14 2.554240e-14
## [3,] 0.002653897 -2.256466e-14 5.473834e-14
## [4,] 0.003067335 -8.354587e-16 3.844750e-14
## [5,] 0.001612752  1.971652e-14 1.877639e-14
## [6,] 0.002455213  2.709642e-14 9.350326e-15
#principal components (each PC is a linear combination of each of the original features)
pcs <- X %*% evectors
colnames(pcs) <-paste0('PC', 1:ncol(pcs))
head(pcs[,1:4])
##         PC1        PC2        PC3       PC4
## 1 11.137044  2.3507545 -1.7893031  1.807111
## 2  6.343140 -1.6726233 -1.8967130  2.255265
## 3  2.555256 -1.2286008  1.8883608 -2.155570
## 4 -4.796155 -4.3618166 -0.8038962  2.299482
## 5 -2.752111 -2.4552902  2.5989538 -2.062738
## 6  4.699144 -0.0739867  0.2789733  1.227586
#high level summary of doing PCA
pc <- princomp(X, scores = TRUE)
summary(pc)
## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4     Comp.5
## Standard deviation     4.9326789 4.0360965 3.02162558 2.81397046 2.53052082
## Proportion of Variance 0.2386621 0.1597868 0.08955688 0.07767062 0.06281126
## Cumulative Proportion  0.2386621 0.3984488 0.48800569 0.56567631 0.62848757
##                            Comp.6    Comp.7     Comp.8     Comp.9    Comp.10
## Standard deviation     2.09168135 1.8823422 1.72071511 1.47110777 1.27572454
## Proportion of Variance 0.04291496 0.0347548 0.02904261 0.02122788 0.01596362
## Cumulative Proportion  0.67140253 0.7061573 0.73519995 0.75642783 0.77239145
##                           Comp.11    Comp.12    Comp.13    Comp.14    Comp.15
## Standard deviation     1.26639012 1.21586732 1.20654739 1.04850815 1.01969247
## Proportion of Variance 0.01573087 0.01450074 0.01427928 0.01078354 0.01019897
## Cumulative Proportion  0.78812232 0.80262306 0.81690234 0.82768588 0.83788485
##                            Comp.16     Comp.17     Comp.18     Comp.19
## Standard deviation     0.989307306 0.976271643 0.937987851 0.914970955
## Proportion of Variance 0.009600196 0.009348868 0.008630026 0.008211685
## Cumulative Proportion  0.847485042 0.856833910 0.865463936 0.873675622
##                            Comp.20     Comp.21     Comp.22     Comp.23
## Standard deviation     0.875911957 0.865512451 0.850786030 0.780493078
## Proportion of Variance 0.007525556 0.007347918 0.007100001 0.005975246
## Cumulative Proportion  0.881201178 0.888549096 0.895649097 0.901624343
##                            Comp.24     Comp.25     Comp.26    Comp.27
## Standard deviation     0.769692094 0.728701908 0.708472385 0.70300746
## Proportion of Variance 0.005811012 0.005208558 0.004923382 0.00484772
## Cumulative Proportion  0.907435354 0.912643912 0.917567294 0.92241501
##                            Comp.28     Comp.29    Comp.30    Comp.31
## Standard deviation     0.686829233 0.668760688 0.65251133 0.62607171
## Proportion of Variance 0.004627168 0.004386914 0.00417632 0.00384473
## Cumulative Proportion  0.927042182 0.931429096 0.93560542 0.93945015
##                            Comp.32     Comp.33     Comp.34     Comp.35
## Standard deviation     0.614053732 0.603214708 0.579554833 0.565048753
## Proportion of Variance 0.003698541 0.003569123 0.003294631 0.003131768
## Cumulative Proportion  0.943148688 0.946717811 0.950012441 0.953144209
##                            Comp.36     Comp.37     Comp.38    Comp.39   Comp.40
## Standard deviation     0.538861573 0.520473095 0.508690135 0.50557907 0.4844127
## Proportion of Variance 0.002848211 0.002657139 0.002538191 0.00250724 0.0023017
## Cumulative Proportion  0.955992420 0.958649559 0.961187750 0.96369499 0.9659967
##                            Comp.41     Comp.42    Comp.43     Comp.44
## Standard deviation     0.481134399 0.465598776 0.44850228 0.435680860
## Proportion of Variance 0.002270652 0.002126382 0.00197309 0.001861893
## Cumulative Proportion  0.968267341 0.970393723 0.97236681 0.974228707
##                           Comp.45     Comp.46     Comp.47    Comp.48
## Standard deviation     0.42850672 0.420775769 0.397115542 0.38737787
## Proportion of Variance 0.00180108 0.001736677 0.001546862 0.00147193
## Cumulative Proportion  0.97602979 0.977766464 0.979313325 0.98078526
##                            Comp.49     Comp.50    Comp.51     Comp.52
## Standard deviation     0.364008940 0.352446149 0.34947045 0.322576014
## Proportion of Variance 0.001299696 0.001218437 0.00119795 0.001020662
## Cumulative Proportion  0.982084952 0.983303389 0.98450134 0.985522001
##                            Comp.53      Comp.54      Comp.55      Comp.56
## Standard deviation     0.321055708 0.3142163085 0.3052621659 0.2853168174
## Proportion of Variance 0.001011064 0.0009684454 0.0009140367 0.0007984954
## Cumulative Proportion  0.986533064 0.9875015095 0.9884155462 0.9892140416
##                             Comp.57      Comp.58      Comp.59      Comp.60
## Standard deviation     0.2737585853 0.2690977660 0.2599109128 0.2561377774
## Proportion of Variance 0.0007351114 0.0007102935 0.0006626233 0.0006435243
## Cumulative Proportion  0.9899491531 0.9906594466 0.9913220699 0.9919655942
##                             Comp.61      Comp.62      Comp.63      Comp.64
## Standard deviation     0.2529292131 0.2397154092 0.2362733634 0.2248870324
## Proportion of Variance 0.0006275028 0.0005636501 0.0005475795 0.0004960741
## Cumulative Proportion  0.9925930970 0.9931567471 0.9937043267 0.9942004007
##                             Comp.65      Comp.66      Comp.67      Comp.68
## Standard deviation     0.2173380176 0.2080171493 0.1986859627 0.1951771870
## Proportion of Variance 0.0004633286 0.0004244397 0.0003872149 0.0003736593
## Cumulative Proportion  0.9946637293 0.9950881690 0.9954753839 0.9958490432
##                             Comp.69      Comp.70      Comp.71      Comp.72
## Standard deviation     0.1856936330 0.1800098925 0.1784278362 0.1727328029
## Proportion of Variance 0.0003382297 0.0003178414 0.0003122791 0.0002926627
## Cumulative Proportion  0.9961872728 0.9965051142 0.9968173933 0.9971100560
##                             Comp.73      Comp.74      Comp.75      Comp.76
## Standard deviation     0.1659213685 0.1619551449 0.1518608298 0.1510578655
## Proportion of Variance 0.0002700364 0.0002572807 0.0002262087 0.0002238228
## Cumulative Proportion  0.9973800924 0.9976373731 0.9978635817 0.9980874045
##                             Comp.77      Comp.78      Comp.79      Comp.80
## Standard deviation     0.1443429543 0.1389526415 0.1326806473 0.1262231991
## Proportion of Variance 0.0002043661 0.0001893875 0.0001726763 0.0001562774
## Cumulative Proportion  0.9982917706 0.9984811581 0.9986538345 0.9988101118
##                             Comp.81      Comp.82      Comp.83      Comp.84
## Standard deviation     0.1235752103 0.1212765575 0.1170376777 0.1110082965
## Proportion of Variance 0.0001497892 0.0001442685 0.0001343597 0.0001208728
## Cumulative Proportion  0.9989599010 0.9991041695 0.9992385292 0.9993594020
##                             Comp.85      Comp.86      Comp.87      Comp.88
## Standard deviation     0.1057593846 1.009005e-01 8.309721e-02 7.888263e-02
## Proportion of Variance 0.0001097123 9.986286e-05 6.773148e-05 6.103522e-05
## Cumulative Proportion  0.9994691143 9.995690e-01 9.996367e-01 9.996977e-01
##                             Comp.89      Comp.90      Comp.91      Comp.92
## Standard deviation     0.0745019065 7.400372e-02 6.520519e-02 6.049967e-02
## Proportion of Variance 0.0000544443 5.371861e-05 4.170442e-05 3.590241e-05
## Cumulative Proportion  0.9997521882 9.998059e-01 9.998476e-01 9.998835e-01
##                             Comp.93      Comp.94      Comp.95      Comp.96
## Standard deviation     5.818417e-02 4.970809e-02 4.677072e-02 4.339695e-02
## Proportion of Variance 3.320683e-05 2.423661e-05 2.145684e-05 1.847294e-05
## Cumulative Proportion  9.999167e-01 9.999410e-01 9.999624e-01 9.999809e-01
##                             Comp.97      Comp.98      Comp.99     Comp.100
## Standard deviation     3.192576e-02 2.233391e-02 1.563978e-02 1.363459e-02
## Proportion of Variance 9.997699e-06 4.892685e-06 2.399269e-06 1.823483e-06
## Cumulative Proportion  9.999909e-01 9.999958e-01 9.999982e-01 1.000000e+00
##                            Comp.101     Comp.102
## Standard deviation     3.670186e-08 1.477783e-08
## Proportion of Variance 1.321277e-17 2.142097e-18
## Cumulative Proportion  1.000000e+00 1.000000e+00

We can summarize our findings visually with a plot which will give us a sense of what PCs are important and the amount of variance each captures.

plot(pc, type = "l")

The plot indicates that we do not need to use all of the PCs to get good results since the first couple of PCs already capture most of the variance of our original data. In fact, if we look at the cumulative proportions, we notice that the first 17 PCs already capture roughly 85% of the total variance and the first 34 captures about 95%! This is a big step forward since it shows a significant drop in the number of PCs we would need (total of 102 PCs), reducing dimensionality while still capturing as much variability of the original data as possible. We can build our model using just a few PCs and still get good results; this ultimately sets the stage for dimension reduction techniques like PCR as well as PLSR and gives us a reason as to why we should utilize them.

Principal Component Regression

In the first method, we will use Principal Component Regression (PCR). The idea is that we can fit a least squares model on a small number of PCs and obtain coefficients that give us a good model to use in predicting our response. We use the training set to fit PCR. The pcr function returns the regression coefficients in terms of the X variables, which simplifies interpretation. We can then visualize how the coefficients grow in terms of the number of retained PCs.

formula <- y ~. 
Xtrain <- scale(dat[train, c(predictors, response)])
Xtrain <- as.data.frame(Xtrain)

#PCR on training data
pcr_fit <- pcr(formula, data = Xtrain, scale = FALSE, ncomp = ncol(X))

PCR <- apply(pcr_fit$coefficients, MARGIN = 3, function(u) u)
matplot(t(PCR), type = "l", xlim = c(1, 102), ylim = c(-1, 1), ylab = "Value", xlab = "PCs")

The graph consists of a different line for each coefficient (total of 102 because there are 102 features). Each line captures how each of the coefficients change as the number of retained PCs increases. The coefficients start off stable but as the number of PCs increase, we can see that the values of the coefficients start to grow both in the postive and negative directions. We observe something interesting when the number of PCs reach 60. We can see that at that point, the values of the coefficients begin to fluctuate really rapidly, leading to significant differences between them.

Partial Least Squares Regression

The next method is Partial Least Squares Regression (PLSR). It is very similar to PCR except in PLS, it makes use of the response in order to identify new features (PLS components) that not only approximate the old features well, but also that are related to the response. The function plsr returns the regression coefficients in terms of the X variables, which simplifies interpretation yet again. Similar to what we did with PCR, We can visualize how the coefficients grow in terms of the number of retained PLS components.

#PLSR on training data
plsr_fit <-plsr(formula, data = Xtrain, scale = FALSE, ncomp = ncol(X))

PLSR <- apply(plsr_fit$coefficients, MARGIN = 3,function(u) u)
matplot(t(PLSR), type = "l", xlim = c(1, 102), ylim = c(-1.5, 1.25), ylab = "Value", xlab = "PLS Components")

The graph consists of a different line for each coefficient (total of 102 because there are 102 features). Each line captures how each of the coefficients change as the number of retained PLS components increases. Unlike the coefficients in PCR, we notice that the cofficients in PLS experience much greater changes and are much more sensitive to the number of components retained.

Ridge Regression

So far, we fitted two dimension reduction methods on our training data and observed how the regression coefficients change when we retain different numbers of components for PCR and PLSR. We can also use shrinkage methods like Ridge regression (RR) and Lasso regression to examine some interesting effects that the tuning parameter, lambda, has on the regression coefficients. This way, we will be able to have multiple sources of comparision, providing us more insight about the different regularization techniques used. Below is a graph of how the regression coefficients change in terms of lambda for Ridge regression.

#ridge regression on training data
x <- as.matrix(Xtrain[ ,predictors])
y <- Xtrain[ ,response]

ridge_fit <- glmnet(x, y, alpha = 0)

matplot(t(ridge_fit$beta), type = "l", las = 1, ylim = c(-0.1, 0.2), ylab = "Value", xlab = "Lambda")

The graph consists of a different line for each coefficient (total of 102 because there are 102 features). Each line captures how each of the coefficients change with varying values of lambda. We notice that the change in value of the cofficients for RR follow a similar shape as that of PLSR and PCR.

Lasso Regression

Lasso regression is similar to Ridge regression in the sense that both are shrinkage methods that involve the parameter, lambda. However, one disadvantage that Ridge regression has is the fact that RR will include all features (in this case 102) in the final model. Although this may not be a problem for prediction accuracy, it may make model interpretation somewhat challenging especially when the number of features we are working with is large. Lasso regression helps overcome this issue by slightly adjusting the way the penalty is set up. Below is a graph of how the regression coefficients change in terms of lambda for Lasso regression.

#lasso regression on training data
lasso_fit <- glmnet(x, y, alpha = 1)

matplot(t(lasso_fit$beta), type = "l", las = 1, ylab = "Value", xlab = "Lambda")

The graph consists of a different line for each coefficient (total of 102 because there are 102 features). Each line captures how each of the coefficients change with varying values of lambda. Again, we notice here that the change in value of the cofficients for Lasso follow a similar shape as that of PLSR and PCR. The one thing that stands out about the regression coefficients for Lasso is how sensitive it is to lambda early on.

Cross-Validation

The question to ask now is: what is the best number of components to have? What is the best lambda to use? The answers to these questions can be found by using k-fold cross validation. The size of k is somewhat arbitrary, but typically 5 or 10 is good. We will go with k=5 here. That means we will have to create folds by splitting the training data into 5 subsets of similar size and then train on the first k-1 folds and reserve the kth fold for “testing”. We repeat this process for each fold, training on all folds except for the one that has been reserved for “testing”.

#create folds
folds <- 5
if ((length(train)%%folds)==0){
  cuts <- seq(1,length(train), by = length(train)/folds)
} else {
    cuts <- seq(1,length(train), by = floor(length(train)/folds))
    cuts[folds+1] <-length(train)
}

#starting and ending positions of folds
fold_starts <- cuts[-length(cuts)]
fold_ends <- c(cuts[2:(length(cuts)-1)]-1, cuts[length(cuts)])

folds_list <-as.list(1:folds)
for(fold in 1:folds) {
  folds_list[[fold]] <- fold_starts[fold]:fold_ends[fold]
}

Once we have created the folds, we will train our methods on the training data and compute cross-validation MSEs for each value of a tuning parameter (the number of components and lambda are examples of tuning parameters). We note the tuning parameters that yield the lowest cross-validation MSE for each model (training MSE for each). The following code uses a for loop to do this process for PCR and PLSR. We also make use of the cv.glmnet function to do this process for Ridge regression and Lasso regression.

mse_cv_pcr <- c()
mse_cv_plsr <- c()

#pcr and plsr
for(tune in 1:ncol(X)) {
  mse_pcr_aux <- c()
  mse_plsr_aux <- c()
  for(fold in 1:folds) {
    #train with k-1 folds
    Xtrain_folds <- scale(Xtrain[folds_list[[fold]], ])
    Xtrain_folds <- as.data.frame(Xtrain_folds)
    x_aux <- as.matrix(Xtrain_folds[, predictors])
    y_aux <- Xtrain_folds[,response]
    pcr_aux <- pcr(formula, data = Xtrain_folds, scale = FALSE, ncomp = tune)
    plsr_aux <-plsr(formula, data = Xtrain_folds, scale = FALSE, ncomp = tune)
    
    # predict holdout fold
    Xfoldout <- scale(Xtrain[unlist(folds_list[-fold]), predictors])
    Xfoldout <- as.data.frame(Xfoldout)
    yfoldout <- Xtrain[unlist(folds_list[-fold]), response]
    
    pcr_hat <- predict(pcr_aux, Xfoldout, ncomp = tune)
    plsr_hat <- predict(plsr_aux, Xfoldout, ncomp = tune)
    
    #measure performance
    mse_pcr_aux[fold] <- mean((pcr_hat[,,1]-yfoldout)^2)
    mse_plsr_aux[fold] <- mean((plsr_hat[,,1]-yfoldout)^2)
  }
  mse_cv_pcr[tune] <- mean(mse_pcr_aux)
  mse_cv_plsr[tune] <- mean(mse_plsr_aux)
}

lasso <- cv.glmnet(as.matrix(Xtrain[,c(1:102)]), Xtrain[,response], alpha = 1, type.measure = "mse", nfolds = 5)
ridge <- cv.glmnet(as.matrix(Xtrain[,c(1:102)]), Xtrain[,response], alpha = 0, type.measure = "mse", nfolds = 5)

#lasso
lasso_keep <- lasso$nzero[lasso$lambda == lasso$lambda.min]

#ridge
ridge_keep <- ridge$nzero[ridge$lambda == ridge$lambda.min]

#show the best tuning parameter for each method
cv_mse <- data.frame(pcr = mse_cv_pcr, plsr = mse_cv_plsr)
selected_pcr_plsr <- apply(cv_mse, 2, which.min)
keep <- c(selected_pcr_plsr, "ridge" = ridge$lambda.min, "lasso" = lasso$lambda.min)
keep
##         pcr        plsr       ridge       lasso 
## 14.00000000  4.00000000  0.32492694  0.01223329

This table summarizes, for every method, the best number of components to keep as well as the best lambda to choose (determined by the lowest cross-validation MSE for each). Now that we have found the best tuning parameters for each model, the next question to ask is: which model is the best one? We will use the validation set to determine the answer. In the next step, we will fit our ideal models (those with the best tuning parameters that we just found for each) on our validation set and compare how each model does.

Xval <- scale(dat[validation, c(predictors, response)])
x_val <- as.matrix(Xval[,predictors])
y_val <- Xval[, response]

pcr_hat <- predict(pcr_fit, x_val, ncomp = keep[1])
plsr_hat <- predict(plsr_fit, x_val, ncomp = keep[2])
las_hat <- predict(lasso_fit, newx = x_val, s = keep[3])
rr_hat <- predict(ridge_fit, newx = x_val, s = keep[4])

mse_val <- c(
  "pcr" = mean((pcr_hat[,,1]-y_val)^2),
  "plsr" = mean((plsr_hat[,,1]-y_val)^2),
  "ridge" = mean((rr_hat[,1]-y_val)^2),
  "lasso" = mean((las_hat[,1]-y_val)^2)
)

mse_val
##       pcr      plsr     ridge     lasso 
## 0.3624519 0.3588764 0.3476880 0.5369280
which.min(mse_val)
## ridge 
##     3

The model that gives us the lowest validation MSE is the best model out of the bunch; in our case, Ridge Regression yields the lowest validation MSE. Once we have identified the best model, we need to assess its performance using the test set.

Final Model and its Performance

#test the performance of the best model
Xbest <- scale(dat[c(train, validation),c(predictors, response)])
Xtest <- scale(dat[test,c(predictors, response)])
x_test <- as.matrix(Xtest[, predictors])
y_test <- Xtest[,response]

ridge_best <- glmnet(Xbest[,predictors], Xbest[,response], alpha = 0, lambda = keep[4])
y_hat_test <- predict(ridge_fit, newx = x_test, s = keep[4])
mean((y_hat_test-y_test)^2)
## [1] 0.331372

The resulting quantity above shows the test MSE for our best model (Ridge regression). We note that the test MSE is greater than the validation MSE, but by a very small amount. We can conclude that our model did a fairly good job in predicting “ViolentCrimesPerPop”. For our last step, we will use our entire dataset (training set, validation set, and test set) to fit our crowned model and observe its coefficients.

Xall <- scale(dat)
ridge_model <- glmnet(Xall[,predictors], Xall[,response], alpha = 0, lambda = keep[4])
ridge_model$beta
## 102 x 1 sparse Matrix of class "dgCMatrix"
##                                  s0
## population            -0.0491638432
## householdsize         -0.0296575851
## racepctblack           0.1360736818
## racePctWhite          -0.0711683487
## racePctAsian          -0.0270917885
## racePctHisp            0.0008447537
## agePct12t21            0.0474468570
## agePct12t29           -0.1631222262
## agePct16t24            0.0149398041
## agePct65up            -0.0220666184
## numbUrban             -0.0662126258
## pctUrban               0.0782010845
## medIncome              0.0025570737
## pctWWage              -0.1230538106
## pctWFarmSelf           0.0204778255
## pctWInvInc            -0.0846178506
## pctWSocSec             0.0441072657
## pctWPubAsst            0.0578071799
## pctWRetire            -0.0769737452
## medFamInc             -0.0105337681
## perCapInc             -0.0488610155
## whitePerCap           -0.0144640937
## blackPerCap           -0.0117364695
## indianPerCap          -0.0015510813
## AsianPerCap            0.0332978613
## OtherPerCap            0.0333035847
## HispPerCap             0.0116231328
## NumUnderPov           -0.0294094010
## PctPopUnderPov        -0.0899341263
## PctLess9thGrade       -0.1630281039
## PctNotHSGrad           0.1008233839
## PctBSorMore            0.0348004401
## PctUnemployed         -0.0287455401
## PctEmploy              0.0927356630
## PctEmplManu           -0.0577608813
## PctEmplProfServ       -0.0143637236
## PctOccupManu           0.0273220063
## PctOccupMgmtProf       0.0263912145
## MalePctDivorce         0.1572377452
## MalePctNevMarr         0.1145609615
## FemalePctDiv          -0.0852828205
## TotalPctDiv           -0.0272195207
## PersPerFam            -0.0223951462
## PctFam2Par            -0.0268378529
## PctKids2Par           -0.1644580429
## PctYoungKids2Par       0.0111063241
## PctTeen2Par           -0.0018010256
## PctWorkMomYoungKids    0.0302839580
## PctWorkMom            -0.1053712451
## NumKidsBornNeverMar   -0.0722701022
## PctKidsBornNeverMar    0.2087152419
## NumImmig               0.0605466888
## PctImmigRecent         0.0224669666
## PctImmigRec5          -0.0130889453
## PctImmigRec8          -0.0122816049
## PctImmigRec10          0.0177751472
## PctRecentImmig        -0.0145640151
## PctRecImmig5          -0.0368117995
## PctRecImmig8          -0.0078361943
## PctRecImmig10          0.0283682677
## PctSpeakEnglOnly       0.0125285763
## PctNotSpeakEnglWell   -0.0456592215
## PctLargHouseFam        0.0081864184
## PctLargHouseOccup     -0.0760607938
## PersPerOccupHous       0.1568150205
## PersPerOwnOccHous     -0.0288321659
## PersPerRentOccHous    -0.0266881439
## PctPersOwnOccup       -0.1048253263
## PctPersDenseHous       0.1333692545
## PctHousLess3BR         0.0404593300
## MedNumBR               0.0117927734
## HousVacant             0.1516442887
## PctHousOccup          -0.0348632729
## PctHousOwnOcc          0.0523937702
## PctVacantBoarded       0.0767204224
## PctVacMore6Mos        -0.0480007455
## MedYrHousBuilt         0.0020052541
## PctHousNoPhone         0.0324747898
## PctWOFullPlumb        -0.0062906739
## OwnOccLowQuart        -0.0079906551
## OwnOccMedVal           0.0160187159
## OwnOccHiQuart         -0.0139603868
## OwnOccQrange          -0.0249140880
## RentLowQ              -0.0999785921
## RentMedian             0.0014985799
## RentHighQ             -0.0533925818
## RentQrange             0.0431898420
## MedRent                0.1232714127
## MedRentPctHousInc      0.0121121724
## MedOwnCostPctInc      -0.0103880424
## MedOwnCostPctIncNoMtg -0.0744301120
## NumInShelters          0.0789941307
## NumStreet             -0.0039622510
## PctForeignBorn         0.1349230872
## PctBornSameState      -0.0249750695
## PctSameHouse85         0.0069380749
## PctSameCity85          0.0259725524
## PctSameState85         0.0148322542
## LandArea               0.0020638694
## PopDens               -0.0531134884
## PctUsePubTrans        -0.0041650996
## LemasPctOfficDrugUn    0.0451312598